1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements.  See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License.  You may obtain a copy of the License at
8  *
9  *      http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 /*
19  * $Id: DFAContentModel.cpp 901107 2010-01-20 08:45:02Z borisk $
20  */
21 
22 
23 // ---------------------------------------------------------------------------
24 //  Includes
25 // ---------------------------------------------------------------------------
26 #include <xercesc/util/RuntimeException.hpp>
27 #include <xercesc/framework/XMLBuffer.hpp>
28 #include <xercesc/framework/XMLElementDecl.hpp>
29 #include <xercesc/framework/XMLValidator.hpp>
30 #include <xercesc/validators/common/CMAny.hpp>
31 #include <xercesc/validators/common/CMBinaryOp.hpp>
32 #include <xercesc/validators/common/CMLeaf.hpp>
33 #include <xercesc/validators/common/CMRepeatingLeaf.hpp>
34 #include <xercesc/validators/common/CMUnaryOp.hpp>
35 #include <xercesc/validators/common/DFAContentModel.hpp>
36 #include <xercesc/validators/common/ContentSpecNode.hpp>
37 #include <xercesc/validators/common/Grammar.hpp>
38 #include <xercesc/validators/schema/SchemaSymbols.hpp>
39 #include <xercesc/validators/schema/SubstitutionGroupComparator.hpp>
40 #include <xercesc/validators/schema/XercesElementWildcard.hpp>
41 #include <xercesc/util/RefHashTableOf.hpp>
42 #include <xercesc/util/XMLInteger.hpp>
43 #include <math.h>
44 
45 XERCES_CPP_NAMESPACE_BEGIN
46 
47 struct CMStateSetHasher
48 {
getHashValCMStateSetHasher49   XMLSize_t getHashVal(const void *const key, XMLSize_t mod)
50   {
51     const CMStateSet* const pkey = (const CMStateSet*) key;
52     return ((pkey->hashCode()) % mod);
53   }
54 
equalsCMStateSetHasher55   bool equals(const void *const key1, const void *const key2)
56   {
57     const CMStateSet* const pkey1 = (const CMStateSet*) key1;
58     const CMStateSet* const pkey2 = (const CMStateSet*) key2;
59     return (*pkey1==*pkey2);
60   }
61 };
62 
63 // ---------------------------------------------------------------------------
64 //  DFAContentModel: Constructors and Destructor
65 // ---------------------------------------------------------------------------
DFAContentModel(const bool dtd,ContentSpecNode * const elemContentSpec,MemoryManager * const manager)66 DFAContentModel::DFAContentModel( const bool             dtd
67                                 , ContentSpecNode* const elemContentSpec
68                                 , MemoryManager* const   manager) :
69 
70     fElemMap(0)
71     , fElemMapType(0)
72     , fElemMapSize(0)
73     , fEmptyOk(false)
74     , fEOCPos(0)
75     , fFinalStateFlags(0)
76     , fFollowList(0)
77     , fHeadNode(0)
78     , fLeafCount(0)
79     , fLeafList(0)
80     , fLeafListType(0)
81     , fTransTable(0)
82     , fTransTableSize(0)
83     , fCountingStates(0)
84     , fDTD(dtd)
85     , fIsMixed(false)
86     , fLeafNameTypeVector(0)
87     , fMemoryManager(manager)
88 {
89     // And build the DFA data structures
90     buildDFA(elemContentSpec);
91 }
92 
DFAContentModel(const bool dtd,ContentSpecNode * const elemContentSpec,const bool isMixed,MemoryManager * const manager)93 DFAContentModel::DFAContentModel( const bool             dtd
94                                 , ContentSpecNode* const elemContentSpec
95                                 , const bool             isMixed
96                                 , MemoryManager* const   manager):
97 
98     fElemMap(0)
99     , fElemMapType(0)
100     , fElemMapSize(0)
101     , fEmptyOk(false)
102     , fEOCPos(0)
103     , fFinalStateFlags(0)
104     , fFollowList(0)
105     , fHeadNode(0)
106     , fLeafCount(0)
107     , fLeafList(0)
108     , fLeafListType(0)
109     , fTransTable(0)
110     , fTransTableSize(0)
111     , fCountingStates(0)
112     , fDTD(dtd)
113     , fIsMixed(isMixed)
114     , fLeafNameTypeVector(0)
115     , fMemoryManager(manager)
116 {
117     // And build the DFA data structures
118     buildDFA(elemContentSpec);
119 }
120 
~DFAContentModel()121 DFAContentModel::~DFAContentModel()
122 {
123     //
124     //  Clean up all the stuff that is not just temporary representation
125     //  data that was cleaned up after building the DFA.
126     //
127     fMemoryManager->deallocate(fFinalStateFlags); //delete [] fFinalStateFlags;
128 
129     unsigned int index;
130     for (index = 0; index < fTransTableSize; index++)
131         fMemoryManager->deallocate(fTransTable[index]); //delete [] fTransTable[index];
132     fMemoryManager->deallocate(fTransTable); //delete [] fTransTable;
133 
134     if(fCountingStates)
135     {
136         for (unsigned int j = 0; j < fTransTableSize; ++j)
137             delete fCountingStates[j];
138         fMemoryManager->deallocate(fCountingStates);
139     }
140 
141     for (index = 0; index < fLeafCount; index++)
142         delete fElemMap[index];
143     fMemoryManager->deallocate(fElemMap); //delete [] fElemMap;
144 
145     fMemoryManager->deallocate(fElemMapType); //delete [] fElemMapType;
146     fMemoryManager->deallocate(fLeafListType); //delete [] fLeafListType;
147 
148     delete fLeafNameTypeVector;
149 }
150 
151 
152 // ---------------------------------------------------------------------------
153 //  DFAContentModel: Implementation of the ContentModel virtual interface
154 // ---------------------------------------------------------------------------
155 bool
validateContent(QName ** const children,XMLSize_t childCount,unsigned int,XMLSize_t * indexFailingChild,MemoryManager * const) const156 DFAContentModel::validateContent( QName** const        children
157                                 , XMLSize_t            childCount
158                                 , unsigned int
159                                 , XMLSize_t*           indexFailingChild
160                                 , MemoryManager*    const) const
161 {
162     //
163     //  If there are no children, then either we fail on the 0th element
164     //  or we return success. It depends upon whether this content model
165     //  accepts empty content, which we determined earlier.
166     //
167     if (!childCount)
168     {
169         // success
170         if(fEmptyOk)
171             return true;
172         *indexFailingChild=0;
173         return false;
174     }
175 
176     //
177     //  Lets loop through the children in the array and move our way
178     //  through the states. Note that we use the fElemMap array to map
179     //  an element index to a state index.
180     //
181     unsigned int curState = 0;
182     unsigned int nextState = 0;
183     unsigned int loopCount = 0;
184     unsigned int childIndex = 0;
185     for (; childIndex < childCount; childIndex++)
186     {
187         // Get the current element index out
188         const QName* curElem = children[childIndex];
189         const XMLCh* curElemRawName = 0;
190         if (fDTD)
191             curElemRawName = curElem->getRawName();
192 
193         // If this is text in a Schema mixed content model, skip it.
194         if ( fIsMixed &&
195             ( curElem->getURI() == XMLElementDecl::fgPCDataElemId))
196             continue;
197 
198         // Look up this child in our element map
199         unsigned int elemIndex = 0;
200         for (; elemIndex < fElemMapSize; elemIndex++)
201         {
202             const QName* inElem  = fElemMap[elemIndex];
203             if (fDTD) {
204                 if (XMLString::equals(inElem->getRawName(), curElemRawName)) {
205                     nextState = fTransTable[curState][elemIndex];
206                     if (nextState != XMLContentModel::gInvalidTrans)
207                         break;
208                 }
209             }
210             else {
211                 ContentSpecNode::NodeTypes type = fElemMapType[elemIndex];
212                 if (type == ContentSpecNode::Leaf)
213                 {
214                     if ((inElem->getURI() == curElem->getURI()) &&
215                     (XMLString::equals(inElem->getLocalPart(), curElem->getLocalPart()))) {
216                         nextState = fTransTable[curState][elemIndex];
217                         if (nextState != XMLContentModel::gInvalidTrans)
218                             break;
219                     }
220                 }
221                 else if ((type & 0x0f)== ContentSpecNode::Any)
222                 {
223                     nextState = fTransTable[curState][elemIndex];
224                     if (nextState != XMLContentModel::gInvalidTrans)
225                         break;
226                 }
227                 else if ((type & 0x0f) == ContentSpecNode::Any_NS)
228                 {
229                     if (inElem->getURI() == curElem->getURI())
230                     {
231                         nextState = fTransTable[curState][elemIndex];
232                         if (nextState != XMLContentModel::gInvalidTrans)
233                             break;
234                     }
235                 }
236                 else if ((type & 0x0f) == ContentSpecNode::Any_Other)
237                 {
238                     // Here we assume that empty string has id 1.
239                     //
240                     unsigned int uriId = curElem->getURI();
241                     if (uriId != 1 && uriId != inElem->getURI()) {
242                         nextState = fTransTable[curState][elemIndex];
243                         if (nextState != XMLContentModel::gInvalidTrans)
244                             break;
245                     }
246                 }
247             }
248         }//for elemIndex
249 
250         // If "nextState" is -1, we found a match, but the transition is invalid
251         if (nextState == XMLContentModel::gInvalidTrans)
252         {
253             *indexFailingChild=childIndex;
254             return false;
255         }
256 
257         // If we didn't find it, then obviously not valid
258         if (elemIndex == fElemMapSize)
259         {
260             *indexFailingChild=childIndex;
261             return false;
262         }
263 
264         unsigned int nextLoop = 0;
265         if(!handleRepetitions(curElem, curState, loopCount, nextState, nextLoop, elemIndex, 0))
266         {
267             *indexFailingChild=childIndex;
268             return false;
269         }
270 
271         curState = nextState;
272         loopCount = nextLoop;
273         nextState = 0;
274 
275     }//for childIndex
276 
277     //
278     //  We transitioned all the way through the input list. However, that
279     //  does not mean that we ended in a final state. So check whether
280     //  our ending state is a final state.
281     //
282     if (!fFinalStateFlags[curState])
283     {
284         *indexFailingChild=childIndex;
285         return false;
286     }
287 
288     // verify if we exited before the minOccurs was satisfied
289     if (fCountingStates != 0) {
290         Occurence* o = fCountingStates[curState];
291         if (o != 0 && loopCount < (unsigned int)o->minOccurs) {
292             // not enough loops on the current state to be considered final.
293             *indexFailingChild=childIndex;
294             return false;
295         }
296     }
297 
298     //success
299     return true;
300 }
301 
validateContentSpecial(QName ** const children,XMLSize_t childCount,unsigned int,GrammarResolver * const pGrammarResolver,XMLStringPool * const pStringPool,XMLSize_t * indexFailingChild,MemoryManager * const) const302 bool DFAContentModel::validateContentSpecial(QName** const            children
303                                             , XMLSize_t               childCount
304                                             , unsigned int
305                                             , GrammarResolver*  const pGrammarResolver
306                                             , XMLStringPool*    const pStringPool
307                                             , XMLSize_t*              indexFailingChild
308                                             , MemoryManager*    const) const
309 {
310 
311     SubstitutionGroupComparator comparator(pGrammarResolver, pStringPool);
312 
313     if (childCount == 0)
314     {
315         if(fEmptyOk)
316             return true;
317         *indexFailingChild=0;
318         return false;
319     }
320 
321     //
322     //  Lets loop through the children in the array and move our way
323     //  through the states. Note that we use the fElemMap array to map
324     //  an element index to a state index.
325     //
326     unsigned int curState = 0;
327     unsigned int loopCount = 0;
328     unsigned int nextState = 0;
329     unsigned int childIndex = 0;
330     for (; childIndex < childCount; childIndex++)
331     {
332         // Get the current element index out
333         QName* curElem = children[childIndex];
334 
335         // If this is text in a Schema mixed content model, skip it.
336         if ( fIsMixed &&
337             ( curElem->getURI() == XMLElementDecl::fgPCDataElemId))
338             continue;
339 
340         // Look up this child in our element map
341         unsigned int elemIndex = 0;
342         for (; elemIndex < fElemMapSize; elemIndex++)
343         {
344             QName* inElem  = fElemMap[elemIndex];
345             ContentSpecNode::NodeTypes type = fElemMapType[elemIndex];
346             if (type == ContentSpecNode::Leaf)
347             {
348                 if (comparator.isEquivalentTo(curElem, inElem) )
349                 {
350                     nextState = fTransTable[curState][elemIndex];
351                     if (nextState != XMLContentModel::gInvalidTrans)
352                         break;
353                 }
354 
355             }
356             else if ((type & 0x0f)== ContentSpecNode::Any)
357             {
358                 nextState = fTransTable[curState][elemIndex];
359                 if (nextState != XMLContentModel::gInvalidTrans)
360                     break;
361             }
362             else if ((type & 0x0f) == ContentSpecNode::Any_NS)
363             {
364                 if (inElem->getURI() == curElem->getURI())
365                 {
366                     nextState = fTransTable[curState][elemIndex];
367                     if (nextState != XMLContentModel::gInvalidTrans)
368                         break;
369                 }
370             }
371             else if ((type & 0x0f) == ContentSpecNode::Any_Other)
372             {
373                 // Here we assume that empty string has id 1.
374                 //
375                 unsigned int uriId = curElem->getURI();
376                 if (uriId != 1 && uriId != inElem->getURI())
377                 {
378                     nextState = fTransTable[curState][elemIndex];
379                     if (nextState != XMLContentModel::gInvalidTrans)
380                         break;
381                 }
382             }
383         }//for elemIndex
384 
385         // If "nextState" is -1, we found a match, but the transition is invalid
386         if (nextState == XMLContentModel::gInvalidTrans)
387         {
388             *indexFailingChild=childIndex;
389             return false;
390         }
391 
392         // If we didn't find it, then obviously not valid
393         if (elemIndex == fElemMapSize)
394         {
395             *indexFailingChild=childIndex;
396             return false;
397         }
398 
399         unsigned int nextLoop = 0;
400         if(!handleRepetitions(curElem, curState, loopCount, nextState, nextLoop, elemIndex, &comparator))
401         {
402             *indexFailingChild=childIndex;
403             return false;
404         }
405 
406         curState = nextState;
407         loopCount = nextLoop;
408         nextState = 0;
409 
410     }//for childIndex
411 
412     //
413     //  We transitioned all the way through the input list. However, that
414     //  does not mean that we ended in a final state. So check whether
415     //  our ending state is a final state.
416     //
417     if (!fFinalStateFlags[curState])
418     {
419         *indexFailingChild=childIndex;
420         return false;
421     }
422 
423     // verify if we exited before the minOccurs was satisfied
424     if (fCountingStates != 0) {
425         Occurence* o = fCountingStates[curState];
426         if (o != 0) {
427             if (loopCount < (unsigned int)o->minOccurs) {
428                 // not enough loops on the current state.
429                 *indexFailingChild=childIndex;
430                 return false;
431             }
432         }
433     }
434 
435     //success
436     return true;
437 }
438 
handleRepetitions(const QName * const curElem,unsigned int curState,unsigned int currentLoop,unsigned int & nextState,unsigned int & nextLoop,XMLSize_t elemIndex,SubstitutionGroupComparator * comparator) const439 bool DFAContentModel::handleRepetitions(const QName* const curElem,
440                                         unsigned int curState,
441                                         unsigned int currentLoop,
442                                         unsigned int& nextState,
443                                         unsigned int& nextLoop,
444                                         XMLSize_t elemIndex,
445                                         SubstitutionGroupComparator * comparator) const
446 {
447     nextLoop = 0;
448     if (fCountingStates != 0) {
449         nextLoop = currentLoop;
450         Occurence* o = fCountingStates[curState];
451         if (o != 0) {
452             if (curState == nextState) {
453                 if (++nextLoop > (unsigned int)o->maxOccurs && o->maxOccurs != -1) {
454                     // It's likely that we looped too many times on the current state
455                     // however it's possible that we actually matched another particle
456                     // which allows the same name.
457                     //
458                     // Consider:
459                     //
460                     // <xs:sequence>
461                     //  <xs:element name="foo" type="xs:string" minOccurs="3" maxOccurs="3"/>
462                     //  <xs:element name="foo" type="xs:string" fixed="bar"/>
463                     // </xs:sequence>
464                     //
465                     // and
466                     //
467                     // <xs:sequence>
468                     //  <xs:element name="foo" type="xs:string" minOccurs="3" maxOccurs="3"/>
469                     //  <xs:any namespace="##any" processContents="skip"/>
470                     // </xs:sequence>
471                     //
472                     // In the DFA there will be two transitions from the current state which
473                     // allow "foo". Note that this is not a UPA violation. The ambiguity of which
474                     // transition to take is resolved by the current value of the counter. Since
475                     // we've already seen enough instances of the first "foo" perhaps there is
476                     // another element declaration or wildcard deeper in the element map which
477                     // matches.
478                     unsigned int tempNextState = 0;
479 
480                     while (++elemIndex < fElemMapSize) {
481                         QName* inElem  = fElemMap[elemIndex];
482                         ContentSpecNode::NodeTypes type = fElemMapType[elemIndex];
483                         if (type == ContentSpecNode::Leaf)
484                         {
485                             if(comparator!=0) {
486                                 if (comparator->isEquivalentTo(curElem, inElem) )
487                                 {
488                                     tempNextState = fTransTable[curState][elemIndex];
489                                     if (tempNextState != XMLContentModel::gInvalidTrans)
490                                         break;
491                                 }
492                             }
493                             else if (fDTD) {
494                                 if (XMLString::equals(inElem->getRawName(), curElem->getRawName())) {
495                                     tempNextState = fTransTable[curState][elemIndex];
496                                     if (tempNextState != XMLContentModel::gInvalidTrans)
497                                         break;
498                                 }
499                             }
500                             else {
501                                 if ((inElem->getURI() == curElem->getURI()) &&
502                                 (XMLString::equals(inElem->getLocalPart(), curElem->getLocalPart()))) {
503                                     tempNextState = fTransTable[curState][elemIndex];
504                                     if (tempNextState != XMLContentModel::gInvalidTrans)
505                                         break;
506                                 }
507                             }
508                         }
509                         else if ((type & 0x0f)== ContentSpecNode::Any)
510                         {
511                             tempNextState = fTransTable[curState][elemIndex];
512                             if (tempNextState != XMLContentModel::gInvalidTrans)
513                                 break;
514                         }
515                         else if ((type & 0x0f) == ContentSpecNode::Any_NS)
516                         {
517                             if (inElem->getURI() == curElem->getURI())
518                             {
519                                 tempNextState = fTransTable[curState][elemIndex];
520                                 if (tempNextState != XMLContentModel::gInvalidTrans)
521                                     break;
522                             }
523                         }
524                         else if ((type & 0x0f) == ContentSpecNode::Any_Other)
525                         {
526                             // Here we assume that empty string has id 1.
527                             //
528                             unsigned int uriId = curElem->getURI();
529                             if (uriId != 1 && uriId != inElem->getURI())
530                             {
531                                 tempNextState = fTransTable[curState][elemIndex];
532                                 if (tempNextState != XMLContentModel::gInvalidTrans)
533                                     break;
534                             }
535                         }
536                     }
537 
538                     // if we still can't find a match, report the error
539                     if (elemIndex == fElemMapSize)
540                         return false;
541 
542                     // if we found a match, set the next state and reset the
543                     // counter if the next state is a counting state.
544                     nextState = tempNextState;
545                     Occurence* o = fCountingStates[nextState];
546                     if (o != 0) {
547                       nextLoop = (elemIndex == XMLSize_t (o->elemIndex)) ? 1 : 0;
548                     }
549                 }
550             }
551             else if (nextLoop < (unsigned int)o->minOccurs) {
552                 // not enough loops on the current state.
553                 return false;
554             }
555             else {
556                 // Exiting a counting state. If we're entering a new
557                 // counting state, reset the counter.
558                 o = fCountingStates[nextState];
559                 if (o != 0) {
560                   nextLoop = (elemIndex == XMLSize_t (o->elemIndex)) ? 1 : 0;
561                 }
562             }
563         }
564         else {
565             o = fCountingStates[nextState];
566             if (o != 0) {
567                 // Entering a new counting state. Reset the counter.
568                 // If we've already seen one instance of the looping
569                 // particle set the counter to 1, otherwise set it
570                 // to 0.
571               nextLoop = (elemIndex == XMLSize_t (o->elemIndex)) ? 1 : 0;
572             }
573         }
574     }
575     return true;
576 }
577 
578 // ---------------------------------------------------------------------------
579 //  DFAContentModel: Private helper methods
580 // ---------------------------------------------------------------------------
buildDFA(ContentSpecNode * const curNode)581 void DFAContentModel::buildDFA(ContentSpecNode* const curNode)
582 {
583     unsigned int index;
584 
585     //
586     //  The first step we need to take is to rewrite the content model using
587     //  our CMNode objects, and in the process get rid of any repetition short
588     //  cuts, converting them into '*' style repetitions or getting rid of
589     //  repetitions altogether.
590     //
591     //  The conversions done are:
592     //
593     //  x+ -> (x|x*)
594     //  x? -> (x|epsilon)
595     //
596     //  This is a relatively complex scenario. What is happening is that we
597     //  create a top level binary node of which the special EOC value is set
598     //  as the right side node. The the left side is set to the rewritten
599     //  syntax tree. The source is the original content model info from the
600     //  decl pool. The rewrite is done by buildSyntaxTree() which recurses the
601     //  decl pool's content of the element and builds a new tree in the
602     //  process.
603     //
604     //  Note that, during this operation, we set each non-epsilon leaf node's
605     //  DFA state position and count the number of such leafs, which is left
606     //  in the fLeafCount member.
607     //
608     fLeafCount=countLeafNodes(curNode);
609     fEOCPos = fLeafCount++;
610 
611     //  We need to build an array of references to the non-epsilon
612     //  leaf nodes. We will put them in the array according to their position values
613     //
614     fLeafList = (CMLeaf**) fMemoryManager->allocate(fLeafCount*sizeof(CMLeaf*)); //new CMLeaf*[fLeafCount];
615     fLeafListType = (ContentSpecNode::NodeTypes*) fMemoryManager->allocate
616     (
617         fLeafCount * sizeof(ContentSpecNode::NodeTypes)
618     ); //new ContentSpecNode::NodeTypes[fLeafCount];
619     //
620     //  And, moving onward... We now need to build the follow position sets
621     //  for all the nodes. So we allocate an array of pointers to state sets,
622     //  one for each leaf node (i.e. each significant DFA position.)
623     //
624     fFollowList = (CMStateSet**) fMemoryManager->allocate
625     (
626         fLeafCount * sizeof(CMStateSet*)
627     ); //new CMStateSet*[fLeafCount];
628     for (index = 0; index < fLeafCount; index++)
629         fFollowList[index] = new (fMemoryManager) CMStateSet(fLeafCount, fMemoryManager);
630 
631     //  The buildSyntaxTree function will recursively iterate over the ContentSpecNode
632     //  and build the CMNode hierarchy; it will also put every leaf node in the fLeafList
633     //  array, then calculate the first and last position sets of each node. This is
634     //  cached away in each of the nodes.
635     //
636     //  Along the way we also set the leaf count in each node as the maximum
637     //  state count. They must know this in order to create their first/last
638     //  position sets.
639     //
640     unsigned int counter=0;
641     CMNode* nodeOrgContent = buildSyntaxTree(curNode, counter);
642     //
643     //  Check to see whether this content model can handle an empty content,
644     //  which is something we need to optimize by looking now before we
645     //  throw away the info that would tell us that.
646     //
647     //  If the left node of the head (the top level of the original content)
648     //  is nullable, then its true.
649     //
650     fEmptyOk = nodeOrgContent->isNullable();
651 
652     //
653     //  And handle specially the EOC node, which also must be numbered and
654     //  counted as a non-epsilon leaf node. It could not be handled in the
655     //  above tree build because it was created before all that started. We
656     //  save the EOC position since its used during the DFA building loop.
657     //
658     CMLeaf* nodeEOC = new (fMemoryManager) CMLeaf
659     (
660         new (fMemoryManager) QName
661         (
662             XMLUni::fgZeroLenString
663             , XMLUni::fgZeroLenString
664             , XMLContentModel::gEOCFakeId
665             , fMemoryManager
666         )
667         , fEOCPos
668         , true
669         , fLeafCount
670         , fMemoryManager
671     );
672     fHeadNode = new (fMemoryManager) CMBinaryOp
673     (
674         ContentSpecNode::Sequence
675         , nodeOrgContent
676         , nodeEOC
677         , fLeafCount
678         , fMemoryManager
679     );
680 
681     //  Put also the final EOC node in the leaf array
682     fLeafList[counter] = new (fMemoryManager) CMLeaf
683     (
684         nodeEOC->getElement()
685         , nodeEOC->getPosition()
686         , fLeafCount
687         , fMemoryManager
688     );
689     fLeafListType[counter] = ContentSpecNode::Leaf;
690 
691     //
692     //  Now handle our top level. We use our left child's last pos set and our
693     //  right child's first pos set, so get them now for convenience.
694     //
695     const CMStateSet& last  = nodeOrgContent->getLastPos();
696     const CMStateSet& first = nodeEOC->getFirstPos();
697 
698     //
699     //  Now, for every position which is in our left child's last set
700     //  add all of the states in our right child's first set to the
701     //  follow set for that position.
702     //
703     CMStateSetEnumerator enumLast(&last);
704     while(enumLast.hasMoreElements())
705     {
706         XMLSize_t index=enumLast.nextElement();
707         *fFollowList[index] |= first;
708     }
709 
710     //
711     //  And finally the big push... Now we build the DFA using all the states
712     //  and the tree we've built up. First we set up the various data
713     //  structures we are going to use while we do this.
714     //
715     //  First of all we need an array of unique element ids in our content
716     //  model. For each transition table entry, we need a set of contiguous
717     //  indices to represent the transitions for a particular input element.
718     //  So we need to a zero based range of indexes that map to element types.
719     //  This element map provides that mapping.
720     //
721     fElemMap = (QName**) fMemoryManager->allocate
722     (
723         fLeafCount * sizeof(QName*)
724     ); //new QName*[fLeafCount];
725     fElemMapType = (ContentSpecNode::NodeTypes*) fMemoryManager->allocate
726     (
727         fLeafCount * sizeof(ContentSpecNode::NodeTypes)
728     ); //new ContentSpecNode::NodeTypes[fLeafCount];
729     fElemMapSize = 0;
730 
731     Occurence** elemOccurenceMap=0;
732     for (unsigned int outIndex = 0; outIndex < fLeafCount; outIndex++)
733     {
734         fElemMap[outIndex] = new (fMemoryManager) QName(fMemoryManager);
735 
736         if ( (fLeafListType[outIndex] & 0x0f) != ContentSpecNode::Leaf )
737             if (!fLeafNameTypeVector)
738                 fLeafNameTypeVector = new (fMemoryManager) ContentLeafNameTypeVector(fMemoryManager);
739 
740         // Get the current leaf's element index
741         CMLeaf* leaf=fLeafList[outIndex];
742         const QName* element = leaf->getElement();
743         const XMLCh* elementRawName = 0;
744         if (fDTD && element)
745             elementRawName = element->getRawName();
746 
747         // See if the current leaf node's element index is in the list
748         unsigned int inIndex = 0;
749 
750         for (; inIndex < fElemMapSize; inIndex++)
751         {
752             const QName* inElem = fElemMap[inIndex];
753             if (fDTD) {
754                 if (XMLString::equals(inElem->getRawName(), elementRawName)) {
755                     break;
756                 }
757             }
758             else {
759                 if ((fElemMapType[inIndex] == fLeafListType[outIndex]) &&
760                     (inElem->getURI() == element->getURI()) &&
761                     (XMLString::equals(inElem->getLocalPart(), element->getLocalPart()))) {
762                     break;
763                 }
764             }
765         }
766 
767         // If it was not in the list, then add it and bump the map size
768         if (inIndex == fElemMapSize)
769         {
770             fElemMap[fElemMapSize]->setValues(*element);
771             if(leaf->isRepeatableLeaf())
772             {
773                 if (elemOccurenceMap == 0) {
774                     elemOccurenceMap = (Occurence**)fMemoryManager->allocate(fLeafCount*sizeof(Occurence*));
775                     memset(elemOccurenceMap, 0, fLeafCount*sizeof(Occurence*));
776                 }
777                 elemOccurenceMap[fElemMapSize] = new (fMemoryManager) Occurence(((CMRepeatingLeaf*)leaf)->getMinOccurs(), ((CMRepeatingLeaf*)leaf)->getMaxOccurs(), fElemMapSize);
778             }
779             fElemMapType[fElemMapSize] = fLeafListType[outIndex];
780             ++fElemMapSize;
781         }
782     }
783 
784     // set up the fLeafNameTypeVector object if there is one.
785     if (fLeafNameTypeVector) {
786         fLeafNameTypeVector->setValues(fElemMap, fElemMapType, fElemMapSize);
787     }
788 
789     /***
790      * Optimization(Jan, 2001); We sort fLeafList according to
791      * elemIndex which is *uniquely* associated to each leaf.
792      * We are *assuming* that each element appears in at least one leaf.
793      **/
794     // don't forget to delete it
795 #ifdef OPTIMIZED_BUT_STILL_LINEAR_SEARCH
796     int *fLeafSorter = (int*) fMemoryManager->allocate
797     (
798         (fLeafCount + fElemMapSize) * sizeof(int)
799     ); //new int[fLeafCount + fElemMapSize];
800     unsigned int fSortCount = 0;
801 
802     for (unsigned int elemIndex = 0; elemIndex < fElemMapSize; elemIndex++)
803     {
804         const QName* element = fElemMap[elemIndex];
805         const XMLCh* elementRawName = 0;
806         if (fDTD && element)
807             elementRawName = element->getRawName();
808 
809         for (unsigned int leafIndex = 0; leafIndex < fLeafCount; leafIndex++)
810         {
811             const QName* leaf = fLeafList[leafIndex]->getElement();
812             if (fDTD) {
813                 if (XMLString::equals(leaf->getRawName(), elementRawName)) {
814                     fLeafSorter[fSortCount++] = leafIndex;
815                 }
816             }
817             else {
818                 if ((fElemMapType[elemIndex] == fLeafListType[leafIndex]) &&
819                     (leaf->getURI() == element->getURI()) &&
820                     (XMLString::equals(leaf->getLocalPart(), element->getLocalPart()))) {
821                       fLeafSorter[fSortCount++] = leafIndex;
822                 }
823             }
824         }
825         fLeafSorter[fSortCount++] = -1;
826     }
827 #endif
828 
829     // instead of using a single array with -1 to separate elements, use a bidimensional map
830     unsigned int** fLeafSorter = (unsigned int**)fMemoryManager->allocate(fElemMapSize * sizeof(unsigned int*));
831     unsigned int* tmpSorter = (unsigned int*)fMemoryManager->allocate(fLeafCount * sizeof(unsigned int));
832     for (unsigned int elemIndex = 0; elemIndex < fElemMapSize; elemIndex++)
833     {
834         const QName* element = fElemMap[elemIndex];
835         const XMLCh* elementRawName = 0;
836         if (fDTD && element)
837             elementRawName = element->getRawName();
838 
839         unsigned int fSortCount=0;
840         for (unsigned int leafIndex = 0; leafIndex < fLeafCount; leafIndex++)
841         {
842             const QName* leaf = fLeafList[leafIndex]->getElement();
843             if (fDTD) {
844                 if (XMLString::equals(leaf->getRawName(), elementRawName)) {
845                     tmpSorter[fSortCount++] = leafIndex;
846                 }
847             }
848             else {
849                 if ((fElemMapType[elemIndex] == fLeafListType[leafIndex]) &&
850                     (leaf->getURI() == element->getURI()) &&
851                     (XMLString::equals(leaf->getLocalPart(), element->getLocalPart()))) {
852                       tmpSorter[fSortCount++] = leafIndex;
853                 }
854             }
855         }
856 
857         fLeafSorter[elemIndex]=(unsigned int*)fMemoryManager->allocate((fSortCount+1) * sizeof(unsigned int));
858         fLeafSorter[elemIndex][0]=fSortCount;
859         for (unsigned int index=0;index<fSortCount;index++)
860             fLeafSorter[elemIndex][index+1]=tmpSorter[index];
861     }
862     fMemoryManager->deallocate(tmpSorter);
863 
864     //
865     //  Next lets create some arrays, some that that hold transient info
866     //  during the DFA build and some that are permament. These are kind of
867     //  sticky since we cannot know how big they will get, but we don't want
868     //  to use any collection type classes because of performance.
869     //
870     //  Basically they will probably be about fLeafCount*2 on average, but can
871     //  be as large as 2^(fLeafCount*2), worst case. So we start with
872     //  fLeafCount*4 as a middle ground. This will be very unlikely to ever
873     //  have to expand though, it if does, the overhead will be somewhat ugly.
874     //
875     unsigned int curArraySize = fLeafCount * 4;
876     CMStateSet** statesToDo = (CMStateSet**)
877         fMemoryManager->allocate
878         (
879             curArraySize * sizeof(CMStateSet*)
880         ); //new const CMStateSet*[curArraySize];
881     fFinalStateFlags = (bool*) fMemoryManager->allocate
882     (
883         curArraySize * sizeof(bool)
884     ); //new bool[curArraySize];
885     fTransTable = (unsigned int**) fMemoryManager->allocate
886     (
887         curArraySize * sizeof(unsigned int*)
888     ); //new unsigned int*[curArraySize];
889 
890     //
891     //  Ok we start with the initial set as the first pos set of the head node
892     //  (which is the seq node that holds the content model and the EOC node.)
893     //
894     CMStateSet* setT = new (fMemoryManager) CMStateSet(fHeadNode->getFirstPos());
895 
896     //
897     // Note on memory leak: Bugzilla#2707:
898     // ===================================
899     // The CMBinary, pointed to by fHeadNode, shall be released by
900     // deleted by itself.
901     //
902     // fLeafList[] maintains its **OWN** copy of CMLeaf to avoid double deletion
903     // of CMLeaf.
904     //
905 
906     delete fHeadNode;
907 
908     //
909     //  Init our two state flags. Basically the unmarked state counter is
910     //  always chasing the current state counter. When it catches up, that
911     //  means we made a pass through that did not add any new states to the
912     //  lists, at which time we are done. We could have used a expanding array
913     //  of flags which we used to mark off states as we complete them, but
914     //  this is easier though less readable maybe.
915     //
916     unsigned int unmarkedState = 0;
917     unsigned int curState = 0;
918 
919     //
920     //  Init the first transition table entry, and put the initial state
921     //  into the states to do list, then bump the current state.
922     //
923     fTransTable[curState] = makeDefStateList();
924     statesToDo[curState] = setT;
925     curState++;
926 
927     //
928     // the stateTable is an auxiliary means to fast
929     // identification of new state created (instead
930     // of sequential loop statesToDo to find out),
931     // while the role that statesToDo plays remain unchanged.
932     //
933     RefHashTableOf<XMLInteger, CMStateSetHasher> *stateTable =
934         new (fMemoryManager) RefHashTableOf<XMLInteger, CMStateSetHasher>
935         (
936             curArraySize
937             , true
938             , fMemoryManager
939         );
940     //stateTable->put((CMStateSet*)setT, new (fMemoryManager) XMLInteger(0));
941 
942     //
943     //  Ok, almost done with the algorithm from hell... We now enter the
944     //  loop where we go until the states done counter catches up with
945     //  the states to do counter.
946     //
947     CMStateSet* newSet = 0;
948     while (unmarkedState < curState)
949     {
950         //
951         //  Get the next unmarked state out of the list of states to do.
952         //  And get the associated transition table entry.
953         //
954         setT = statesToDo[unmarkedState];
955         unsigned int* transEntry = fTransTable[unmarkedState];
956 
957         // Mark this one final if it contains the EOC state
958         fFinalStateFlags[unmarkedState] = setT->getBit(fEOCPos);
959 
960         // Bump up the unmarked state count, marking this state done
961         unmarkedState++;
962 
963 #ifdef OPTIMIZED_BUT_STILL_LINEAR_SEARCH
964         // Optimization(Jan, 2001)
965         unsigned int sorterIndex = 0;
966         // Optimization(Jan, 2001)
967 #endif
968 
969         // Loop through each possible input symbol in the element map
970         for (unsigned int elemIndex = 0; elemIndex < fElemMapSize; elemIndex++)
971         {
972             //
973             //  Build up a set of states which is the union of all of the
974             //  follow sets of DFA positions that are in the current state. If
975             //  we gave away the new set last time through then create a new
976             //  one. Otherwise, zero out the existing one.
977             //
978             if (!newSet)
979                 newSet = new (fMemoryManager) CMStateSet
980                 (
981                     fLeafCount
982                     , fMemoryManager
983                 );
984             else
985                 newSet->zeroBits();
986 
987 #ifdef OBSOLETED
988 // unoptimized code
989             for (unsigned int leafIndex = 0; leafIndex < fLeafCount; leafIndex++)
990             {
991                 // If this leaf index (DFA position) is in the current set...
992                 if (setT->getBit(leafIndex))
993                 {
994                     //
995                     //  If this leaf is the current input symbol, then we want
996                     //  to add its follow list to the set of states to transition
997                     //  to from the current state.
998                     //
999                     const QName* leaf = fLeafList[leafIndex]->getElement();
1000                     const QName* element = fElemMap[elemIndex];
1001                     if (fDTD) {
1002                         if (XMLString::equals(leaf->getRawName(), element->getRawName())) {
1003                             *newSet |= *fFollowList[leafIndex];
1004                         }
1005                     }
1006                     else {
1007                         if ((leaf->getURI() == element->getURI()) &&
1008                             (XMLString::equals(leaf->getLocalPart(), element->getLocalPart()))) {
1009                             *newSet |= *fFollowList[leafIndex];
1010                         }
1011                     }
1012                 }
1013             } // for leafIndex
1014 #endif
1015 
1016 #ifdef OPTIMIZED_BUT_STILL_LINEAR_SEARCH
1017             // Optimization(Jan, 2001)
1018             int leafIndex = fLeafSorter[sorterIndex++];
1019 
1020             while (leafIndex != -1)
1021             {
1022                 // If this leaf index (DFA position) is in the current set...
1023                 if (setT->getBit(leafIndex))
1024                 {
1025                     //
1026                     //  If this leaf is the current input symbol, then we
1027                     //  want to add its follow list to the set of states to
1028                     //  transition to from the current state.
1029                     //
1030                     *newSet |= *fFollowList[leafIndex];
1031                 }
1032                 leafIndex = fLeafSorter[sorterIndex++];
1033             } // while (leafIndex != -1)
1034 #endif
1035 
1036             unsigned int* fLeafIndexes=fLeafSorter[elemIndex];
1037             unsigned int fNumItems=fLeafIndexes[0];
1038             if(fNumItems!=0)
1039             {
1040                 // The algorithm requires finding the leaf that is present both in the bitfield of the current state, and in the
1041                 // list of places where the currently tested item can appear. When this occurs, the follow list of this parent item
1042                 // is added to the bitfield representing the next state.
1043                 // Both the bitfield and the list of places are sorted, so we can analyze them in two ways; either iterating over the
1044                 // parent items, testing the bitfield for the existence of the parent (N times a constant Tb), or by iterating over the
1045                 // bitfield (restricted to the range of the sorted list of places), using a binary search to locate the leaf in the
1046                 // sorted list of places (M times log(N) testing operations Ts)
1047                 // Assuming that the time to test a bit is roughly the same of the time needed to compute the average of two integers,
1048                 // plus a couple of comparisons and additions, we compare N agains M*log(N) to decide which algorithm should be faster given
1049                 // the two sets
1050                 if(fNumItems <= setT->getBitCountInRange(fLeafIndexes[1], fLeafIndexes[fNumItems])*log((float)fNumItems))
1051                 {
1052                     for(unsigned int i=1; i<=fNumItems; ++i)
1053                         if(setT->getBit(fLeafIndexes[i]))
1054                         {
1055                             //
1056                             //  If this leaf is the current input symbol, then we
1057                             //  want to add its follow list to the set of states to
1058                             //  transition to from the current state.
1059                             //
1060                             *newSet |= *fFollowList[ fLeafIndexes[i] ];
1061                         }
1062                 }
1063                 else
1064                 {
1065                     // Further optimization: given that the bitfield enumerator returns the numbers in order,
1066                     // every time we raise the lower marker we know it will true also for the next bits, so
1067                     // the next binary search will not start from 1 but from this index
1068                     unsigned int lowIndex = 1;
1069                     // Start the enumerator from the first index in the sorted list of places,
1070                     // as nothing before that point will match
1071                     CMStateSetEnumerator enumBits(setT, fLeafIndexes[1]);
1072                     while(enumBits.hasMoreElements())
1073                     {
1074                         unsigned int bitIndex=enumBits.nextElement();
1075                         // if this leaf is greater than the last index in the sorted list of places,
1076                         // nothing can be found from now on, so get out of here
1077                         if(bitIndex > fLeafIndexes[fNumItems])
1078                             break;
1079 
1080                         // Check if this leaf index (DFA position) is in the current set
1081                         // (using binary search: the indexes are sorted)
1082                         unsigned int first=lowIndex,last=fNumItems,i;
1083                         while(first<=last)
1084                         {
1085                             i=(first+last)/2;
1086                             if(fLeafIndexes[i]>bitIndex)
1087                                 last=i-1;
1088                             else if(fLeafIndexes[i]<bitIndex)
1089                                 lowIndex=first=i+1;
1090                             else
1091                             {
1092                                 //
1093                                 //  If this leaf is the current input symbol, then we
1094                                 //  want to add its follow list to the set of states to
1095                                 //  transition to from the current state.
1096                                 //
1097                                 *newSet |= *fFollowList[bitIndex];
1098                                 break;
1099                             }
1100                         }
1101                     }
1102                 }
1103             }
1104 
1105             //
1106             //  If this new set is not empty, then see if its in the list
1107             //  of states to do. If not, then add it.
1108             //
1109             if (!newSet->isEmpty())
1110             {
1111                 //
1112                 //  Search the 'states to do' list to see if this new
1113                 //  state set is already in there.
1114                 //
1115                 /***
1116                 unsigned int stateIndex = 0;
1117                 for (; stateIndex < curState; stateIndex++)
1118                 {
1119                     if (*statesToDo[stateIndex] == *newSet)
1120                         break;
1121                 }
1122                 ***/
1123 
1124                 XMLInteger *stateObj = stateTable->get(newSet);
1125                 unsigned int stateIndex = (stateObj == 0 ? curState : stateObj->intValue());
1126 
1127                 // If we did not find it, then add it
1128                 if (stateIndex == curState)
1129                 {
1130                     //
1131                     //  Put this new state into the states to do and init
1132                     //  a new entry at the same index in the transition
1133                     //  table.
1134                     //
1135                     statesToDo[curState] = newSet;
1136                     fTransTable[curState] = makeDefStateList();
1137                     stateTable->put
1138                     (
1139                         newSet
1140                         , new (fMemoryManager) XMLInteger(curState)
1141                     );
1142 
1143                     // We now have a new state to do so bump the count
1144                     curState++;
1145 
1146                     //
1147                     //  Null out the new set to indicate we adopted it. This
1148                     //  will cause the creation of a new set on the next time
1149                     //  around the loop.
1150                     //
1151                     newSet = 0;
1152                 }
1153 
1154                 //
1155                 //  Now set this state in the transition table's entry for this
1156                 //  element (using its index), with the DFA state we will move
1157                 //  to from the current state when we see this input element.
1158                 //
1159                 transEntry[elemIndex] = stateIndex;
1160 
1161                 // Expand the arrays if we're full
1162                 if (curState == curArraySize)
1163                 {
1164                     //
1165                     //  Yikes, we overflowed the initial array size, so we've
1166                     //  got to expand all of these arrays. So adjust up the
1167                     //  size by 50% and allocate new arrays.
1168                     //
1169                     const unsigned int newSize = (unsigned int)(curArraySize * 1.5);
1170                     CMStateSet** newToDo = (CMStateSet**)
1171                         fMemoryManager->allocate
1172                         (
1173                             newSize * sizeof(CMStateSet*)
1174                         ); //new const CMStateSet*[newSize];
1175                     bool* newFinalFlags = (bool*) fMemoryManager->allocate
1176                     (
1177                         newSize * sizeof(bool)
1178                     ); //new bool[newSize];
1179                     unsigned int** newTransTable = (unsigned int**)
1180                         fMemoryManager->allocate
1181                         (
1182                             newSize * sizeof(unsigned int*)
1183                         ); //new unsigned int*[newSize];
1184 
1185                     // Copy over all of the existing content
1186                     for (unsigned int expIndex = 0; expIndex < curArraySize; expIndex++)
1187                     {
1188                         newToDo[expIndex] = statesToDo[expIndex];
1189                         newFinalFlags[expIndex] = fFinalStateFlags[expIndex];
1190                         newTransTable[expIndex] = fTransTable[expIndex];
1191                     }
1192 
1193                     // Clean up the old stuff
1194                     fMemoryManager->deallocate(statesToDo); //delete [] statesToDo;
1195                     fMemoryManager->deallocate(fFinalStateFlags); //delete [] fFinalStateFlags;
1196                     fMemoryManager->deallocate(fTransTable); //delete [] fTransTable;
1197 
1198                     // Store the new array size and pointers
1199                     curArraySize = newSize;
1200                     statesToDo = newToDo;
1201                     fFinalStateFlags = newFinalFlags;
1202                     fTransTable = newTransTable;
1203                 } //if (curState == curArraySize)
1204             } //if (!newSet->isEmpty())
1205         } // for elemIndex
1206     } //while
1207 
1208     // Store the current state count in the trans table size
1209     fTransTableSize = curState;
1210 
1211     //
1212     // Fill in the occurence information for each looping state
1213     // if we're using counters.
1214     //
1215     if (elemOccurenceMap != 0) {
1216         fCountingStates = (Occurence**)fMemoryManager->allocate(fTransTableSize*sizeof(Occurence));
1217         memset(fCountingStates, 0, fTransTableSize*sizeof(Occurence*));
1218         for (unsigned int i = 0; i < fTransTableSize; ++i) {
1219             unsigned int * transitions = fTransTable[i];
1220             for (unsigned int j = 0; j < fElemMapSize; ++j) {
1221                 if (i == transitions[j]) {
1222                     Occurence* old=elemOccurenceMap[j];
1223                     if(old!=0)
1224                         fCountingStates[i] = new (fMemoryManager) Occurence(old->minOccurs, old->maxOccurs, old->elemIndex);
1225                     break;
1226                 }
1227             }
1228         }
1229         for (unsigned int j = 0; j < fLeafCount; ++j) {
1230             if(elemOccurenceMap[j]!=0)
1231                 delete elemOccurenceMap[j];
1232         }
1233         fMemoryManager->deallocate(elemOccurenceMap);
1234     }
1235 
1236     // If the last temp set was not stored, then clean it up
1237     if (newSet)
1238         delete newSet;
1239 
1240     //
1241     //  Now we can clean up all of the temporary data that was needed during
1242     //  DFA build.
1243     //
1244 
1245     for (index = 0; index < fLeafCount; index++)
1246         delete fFollowList[index];
1247     fMemoryManager->deallocate(fFollowList); //delete [] fFollowList;
1248 
1249     //
1250     // removeAll() will delete all data, XMLInteger,
1251     // while the keys are to be deleted by the
1252     // deletion of statesToDo.
1253     //
1254     delete stateTable;
1255 
1256     for (index = 0; index < curState; index++)
1257         delete statesToDo[index];
1258     fMemoryManager->deallocate(statesToDo); //delete [] statesToDo;
1259 
1260     for (index = 0; index < fLeafCount; index++)
1261         delete fLeafList[index];
1262     fMemoryManager->deallocate(fLeafList); //delete [] fLeafList;
1263 
1264 #ifdef OPTIMIZED_BUT_STILL_LINEAR_SEARCH
1265     fMemoryManager->deallocate(fLeafSorter); //delete [] fLeafSorter;
1266 #endif
1267     for (index=0; index < fElemMapSize; index++)
1268         fMemoryManager->deallocate(fLeafSorter[index]);
1269     fMemoryManager->deallocate(fLeafSorter);
1270 }
1271 
countLeafNodes(ContentSpecNode * const curNode)1272 unsigned int DFAContentModel::countLeafNodes(ContentSpecNode* const curNode)
1273 {
1274     unsigned int count = 0;
1275 
1276     // Get the spec type of the passed node
1277     const ContentSpecNode::NodeTypes curType = curNode->getType();
1278 
1279     if ((curType & 0x0f) == ContentSpecNode::Any
1280         || (curType & 0x0f) == ContentSpecNode::Any_Other
1281         || (curType & 0x0f) == ContentSpecNode::Any_NS
1282         || curType == ContentSpecNode::Leaf
1283         || curType == ContentSpecNode::Loop)
1284     {
1285         count++;
1286     }
1287     else
1288     {
1289         //
1290         //  Its not a leaf, so we have to recurse its left and maybe right
1291         //  nodes. Save both values before we recurse and trash the node.
1292         //
1293         ContentSpecNode* leftNode = curNode->getFirst();
1294         ContentSpecNode* rightNode = curNode->getSecond();
1295 
1296         // Detect if we have a deep tree that can be analyzed using a loop instead of recursion
1297         unsigned int nLoopCount=0;
1298         ContentSpecNode* cursor=curNode;
1299         while(cursor->getType()==ContentSpecNode::Sequence && cursor->getFirst() && cursor->getFirst()->getSecond()==rightNode)
1300         {
1301             nLoopCount++;
1302             cursor=cursor->getFirst();
1303         }
1304         if(nLoopCount!=0)
1305         {
1306             count += countLeafNodes(cursor);
1307             for(unsigned int i=0;i<nLoopCount;i++)
1308                 count += countLeafNodes(rightNode);
1309             return count;
1310         }
1311         if(leftNode)
1312             count+=countLeafNodes(leftNode);
1313         if(rightNode)
1314             count+=countLeafNodes(rightNode);
1315     }
1316     return count;
1317 }
1318 
buildSyntaxTree(ContentSpecNode * const curNode,unsigned int & curIndex)1319 CMNode* DFAContentModel::buildSyntaxTree(ContentSpecNode* const curNode
1320                                          , unsigned int&        curIndex)
1321 {
1322     // Initialize a return node pointer
1323     CMNode* retNode = 0;
1324 
1325     // Get the spec type of the passed node
1326     const ContentSpecNode::NodeTypes curType = curNode->getType();
1327 
1328     if ((curType & 0x0f) == ContentSpecNode::Any
1329         || (curType & 0x0f) == ContentSpecNode::Any_Other
1330         || (curType & 0x0f) == ContentSpecNode::Any_NS)
1331     {
1332         retNode = new (fMemoryManager) CMAny
1333         (
1334             curType
1335             , curNode->getElement()->getURI()
1336             , curIndex
1337             , fLeafCount
1338             , fMemoryManager
1339         );
1340         fLeafList[curIndex] = new (fMemoryManager) CMLeaf
1341         (
1342             new (fMemoryManager) QName
1343             (
1344                 XMLUni::fgZeroLenString
1345                 , XMLUni::fgZeroLenString
1346                 , curNode->getElement()->getURI()
1347                 , fMemoryManager
1348             )
1349             , curIndex
1350             , true
1351             , fLeafCount
1352             , fMemoryManager
1353         );
1354         fLeafListType[curIndex] = curType;
1355         ++curIndex;
1356     }
1357     else if (curType == ContentSpecNode::Leaf)
1358     {
1359         //
1360         //  Create a new leaf node, and pass it the current leaf count, which
1361         //  is its DFA state position. Bump the leaf count after storing it.
1362         //  This makes the positions zero based since we store first and then
1363         //  increment.
1364         //
1365         retNode = new (fMemoryManager) CMLeaf
1366         (
1367             curNode->getElement()
1368             , curIndex
1369             , fLeafCount
1370             , fMemoryManager
1371         );
1372         fLeafList[curIndex] = new (fMemoryManager) CMLeaf
1373         (
1374             curNode->getElement()
1375             , curIndex
1376             , fLeafCount
1377             , fMemoryManager
1378         );
1379         fLeafListType[curIndex] = ContentSpecNode::Leaf;
1380         ++curIndex;
1381     }
1382     else if (curType == ContentSpecNode::Loop)
1383     {
1384         //
1385         //  Create a new leaf node, and pass it the current leaf count, which
1386         //  is its DFA state position. Bump the leaf count after storing it.
1387         //  This makes the positions zero based since we store first and then
1388         //  increment.
1389         //
1390         retNode = new (fMemoryManager) CMRepeatingLeaf
1391         (
1392             curNode->getFirst()->getElement()
1393             , curNode->getMinOccurs()
1394             , curNode->getMaxOccurs()
1395             , curIndex
1396             , fLeafCount
1397             , fMemoryManager
1398         );
1399         fLeafList[curIndex] = new (fMemoryManager) CMRepeatingLeaf
1400         (
1401             curNode->getFirst()->getElement()
1402             , curNode->getMinOccurs()
1403             , curNode->getMaxOccurs()
1404             , curIndex
1405             , fLeafCount
1406             , fMemoryManager
1407         );
1408         fLeafListType[curIndex] = curNode->getFirst()->getType();
1409         ++curIndex;
1410     }
1411      else
1412     {
1413         //
1414         //  Its not a leaf, so we have to recurse its left and maybe right
1415         //  nodes. Save both values before we recurse and trash the node.
1416         //
1417         ContentSpecNode* leftNode = curNode->getFirst();
1418         ContentSpecNode* rightNode = curNode->getSecond();
1419 
1420         // Detect if we have a deep tree that can be analyzed using a loop instead of recursion
1421         unsigned int nLoopCount=0;
1422         ContentSpecNode* cursor=curNode;
1423         while(cursor->getType()==ContentSpecNode::Sequence && cursor->getFirst() && cursor->getFirst()->getSecond()==rightNode)
1424         {
1425             nLoopCount++;
1426             cursor=cursor->getFirst();
1427         }
1428         if(nLoopCount!=0)
1429         {
1430             retNode = buildSyntaxTree(cursor, curIndex);
1431             for(unsigned int i=0;i<nLoopCount;i++)
1432             {
1433                 CMNode* newRight = buildSyntaxTree(rightNode, curIndex);
1434                 //
1435                 //  Now handle our level. We use our left child's last pos set and our
1436                 //  right child's first pos set, so get them now for convenience.
1437                 //
1438                 const CMStateSet& last  = retNode->getLastPos();
1439                 const CMStateSet& first = newRight->getFirstPos();
1440 
1441                 //
1442                 //  Now, for every position which is in our left child's last set
1443                 //  add all of the states in our right child's first set to the
1444                 //  follow set for that position.
1445                 //
1446                 CMStateSetEnumerator enumLast(&last);
1447                 while(enumLast.hasMoreElements())
1448                 {
1449                     XMLSize_t index=enumLast.nextElement();
1450                     *fFollowList[index] |= first;
1451                 }
1452                 retNode = new (fMemoryManager) CMBinaryOp
1453                 (
1454                     ContentSpecNode::Sequence
1455                     , retNode
1456                     , newRight
1457                     , fLeafCount
1458                     , fMemoryManager
1459                 );
1460             }
1461             return retNode;
1462         }
1463 
1464         if (((curType & 0x0f) == ContentSpecNode::Choice)
1465         ||   ((curType & 0x0f) == ContentSpecNode::Sequence))
1466         {
1467             //
1468             //  Recurse on both children, and return a binary op node with the
1469             //  two created sub nodes as its children. The node type is the
1470             //  same type as the source.
1471             //
1472             CMNode* newLeft = buildSyntaxTree(leftNode, curIndex);
1473             CMNode* newRight = buildSyntaxTree(rightNode, curIndex);
1474             if(((curType & 0x0f) == ContentSpecNode::Sequence))
1475             {
1476                 //
1477                 //  Now handle our level. We use our left child's last pos set and our
1478                 //  right child's first pos set, so get them now for convenience.
1479                 //
1480                 const CMStateSet& last  = newLeft->getLastPos();
1481                 const CMStateSet& first = newRight->getFirstPos();
1482 
1483                 //
1484                 //  Now, for every position which is in our left child's last set
1485                 //  add all of the states in our right child's first set to the
1486                 //  follow set for that position.
1487                 //
1488                 CMStateSetEnumerator enumLast(&last);
1489                 while(enumLast.hasMoreElements())
1490                 {
1491                     XMLSize_t index=enumLast.nextElement();
1492                     *fFollowList[index] |= first;
1493                 }
1494             }
1495             retNode = new (fMemoryManager) CMBinaryOp
1496             (
1497                 curType
1498                 , newLeft
1499                 , newRight
1500                 , fLeafCount
1501                 , fMemoryManager
1502             );
1503         }
1504          else if (curType == ContentSpecNode::ZeroOrMore
1505                || curType == ContentSpecNode::ZeroOrOne
1506                || curType == ContentSpecNode::OneOrMore)
1507         {
1508             CMNode* newChild = buildSyntaxTree(leftNode, curIndex);
1509             if (curType == ContentSpecNode::ZeroOrMore
1510                || curType == ContentSpecNode::OneOrMore)
1511             {
1512                 //
1513                 //  Now handle our level. We use our own first and last position
1514                 //  sets, so get them up front.
1515                 //
1516                 const CMStateSet& first = newChild->getFirstPos();
1517                 const CMStateSet& last  = newChild->getLastPos();
1518 
1519                 //
1520                 //  For every position which is in our last position set, add all
1521                 //  of our first position states to the follow set for that
1522                 //  position.
1523                 //
1524                 CMStateSetEnumerator enumLast(&last);
1525                 while(enumLast.hasMoreElements())
1526                 {
1527                     XMLSize_t index=enumLast.nextElement();
1528                     *fFollowList[index] |= first;
1529                 }
1530             }
1531             // This one is fine as is, just change to our form
1532             retNode = new (fMemoryManager) CMUnaryOp
1533             (
1534                 curType
1535                 , newChild
1536                 , fLeafCount
1537                 , fMemoryManager
1538             );
1539         }
1540          else
1541         {
1542             ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::CM_UnknownCMSpecType, fMemoryManager);
1543         }
1544     }
1545     // fault in the first and last pos, then delete it children
1546     retNode->getFirstPos();
1547     retNode->getLastPos();
1548     retNode->orphanChild();
1549     return retNode;
1550 }
1551 
1552 //
1553 //  gInvalidTrans is used to represent bad transitions in the transition table
1554 //  entry for each state. So each entry is initialized to that value. This
1555 //  method creates a new entry and initializes it.
1556 //
makeDefStateList() const1557 unsigned int* DFAContentModel::makeDefStateList() const
1558 {
1559     unsigned int* retArray = (unsigned int*) fMemoryManager->allocate
1560     (
1561         fElemMapSize * sizeof(unsigned int)
1562     ); //new unsigned int[fElemMapSize];
1563     for (unsigned int index = 0; index < fElemMapSize; index++)
1564         retArray[index] = XMLContentModel::gInvalidTrans;
1565     return retArray;
1566 }
1567 
getContentLeafNameTypeVector() const1568 ContentLeafNameTypeVector* DFAContentModel::getContentLeafNameTypeVector() const
1569 {
1570    //later change it to return the data member
1571     return fLeafNameTypeVector;
1572 }
1573 
checkUniqueParticleAttribution(SchemaGrammar * const pGrammar,GrammarResolver * const pGrammarResolver,XMLStringPool * const pStringPool,XMLValidator * const pValidator,unsigned int * const pContentSpecOrgURI,const XMLCh * pComplexTypeName)1574 void DFAContentModel::checkUniqueParticleAttribution (SchemaGrammar*    const pGrammar,
1575                                                       GrammarResolver*  const pGrammarResolver,
1576                                                       XMLStringPool*    const pStringPool,
1577                                                       XMLValidator*     const pValidator,
1578                                                       unsigned int*     const pContentSpecOrgURI,
1579                                                       const XMLCh*            pComplexTypeName /*= 0*/)
1580 {
1581 
1582     SubstitutionGroupComparator comparator(pGrammarResolver, pStringPool);
1583 
1584     unsigned int i, j, k;
1585 
1586     // Rename the URI back
1587     for (i = 0; i < fElemMapSize; i++) {
1588 
1589         unsigned int orgURIIndex = fElemMap[i]->getURI();
1590 
1591         if ((orgURIIndex != XMLContentModel::gEOCFakeId) &&
1592             (orgURIIndex != XMLContentModel::gEpsilonFakeId) &&
1593             (orgURIIndex != XMLElementDecl::fgInvalidElemId) &&
1594             (orgURIIndex != XMLElementDecl::fgPCDataElemId)) {
1595             fElemMap[i]->setURI(pContentSpecOrgURI[orgURIIndex]);
1596         }
1597     }
1598 
1599     // Unique Particle Attribution
1600     // Store the conflict results between any two elements in fElemMap
1601     // 0 - not yet tested, 1 - conflict, (-1) - no conflict
1602     signed char** conflictTable = (signed char**) fMemoryManager->allocate
1603     (
1604         fElemMapSize * sizeof(signed char*)
1605     );
1606 
1607     // initialize the conflict table
1608     for (j = 0; j < fElemMapSize; j++) {
1609         conflictTable[j] = (signed char*) fMemoryManager->allocate
1610         (
1611             fElemMapSize * sizeof(signed char)
1612         );
1613         memset(conflictTable[j], 0, fElemMapSize*sizeof(signed char));
1614     }
1615 
1616     // for each state, check whether it has overlap transitions
1617     for (i = 0; i < fTransTableSize; i++) {
1618         for (j = 0; j < fElemMapSize; j++) {
1619             for (k = j+1; k < fElemMapSize; k++) {
1620                 if (fTransTable[i][j] != XMLContentModel::gInvalidTrans &&
1621                     fTransTable[i][k] != XMLContentModel::gInvalidTrans &&
1622                     conflictTable[j][k] == 0) {
1623 
1624                     // If this is text in a Schema mixed content model, skip it.
1625                     if ( fIsMixed &&
1626                          (( fElemMap[j]->getURI() == XMLElementDecl::fgPCDataElemId) ||
1627                           ( fElemMap[k]->getURI() == XMLElementDecl::fgPCDataElemId)))
1628                         continue;
1629 
1630                     if (XercesElementWildcard::conflict(pGrammar,
1631                                                         fElemMapType[j],
1632                                                         fElemMap[j],
1633                                                         fElemMapType[k],
1634                                                         fElemMap[k],
1635                                                         &comparator)) {
1636                         if (fCountingStates != 0) {
1637                             Occurence* o = fCountingStates[i];
1638                             // If "i" is a counting state and exactly one of the transitions
1639                             // loops back to "i" then the two particles do not overlap if
1640                             // minOccurs == maxOccurs.
1641                             if (o != 0 &&
1642                                 ((fTransTable[i][j] == i) ^ (fTransTable[i][k] == i)) &&
1643                                 o->minOccurs == o->maxOccurs) {
1644                                 conflictTable[j][k] = -1;
1645                                 continue;
1646                             }
1647                         }
1648                        conflictTable[j][k] = 1;
1649 
1650                        XMLBuffer buf1(1023, fMemoryManager);
1651                        if (((fElemMapType[j] & 0x0f) == ContentSpecNode::Any) ||
1652                            ((fElemMapType[j] & 0x0f) == ContentSpecNode::Any_NS))
1653                            buf1.set(SchemaSymbols::fgATTVAL_TWOPOUNDANY);
1654                        else if ((fElemMapType[j] & 0x0f) == ContentSpecNode::Any_Other)
1655                            buf1.set(SchemaSymbols::fgATTVAL_TWOPOUNDOTHER);
1656                        else
1657                            buf1.set(fElemMap[j]->getRawName());
1658 
1659                        XMLBuffer buf2(1023, fMemoryManager);
1660                        if (((fElemMapType[k] & 0x0f) == ContentSpecNode::Any) ||
1661                            ((fElemMapType[k] & 0x0f) == ContentSpecNode::Any_NS))
1662                            buf2.set(SchemaSymbols::fgATTVAL_TWOPOUNDANY);
1663                        else if ((fElemMapType[k] & 0x0f) == ContentSpecNode::Any_Other)
1664                            buf2.set(SchemaSymbols::fgATTVAL_TWOPOUNDOTHER);
1665                        else
1666                            buf2.set(fElemMap[k]->getRawName());
1667 
1668                        pValidator->emitError(XMLValid::UniqueParticleAttributionFail,
1669                                              pComplexTypeName,
1670                                              buf1.getRawBuffer(),
1671                                              buf2.getRawBuffer());
1672                     }
1673                     else
1674                         conflictTable[j][k] = -1;
1675                 }
1676             }
1677         }
1678     }
1679 
1680     for (i = 0; i < fElemMapSize; i++)
1681         fMemoryManager->deallocate(conflictTable[i]);
1682     fMemoryManager->deallocate(conflictTable);
1683 }
1684 
1685 XERCES_CPP_NAMESPACE_END
1686