GME  13
DFAContentModel.cpp
Go to the documentation of this file.
00001 /*
00002  * Licensed to the Apache Software Foundation (ASF) under one or more
00003  * contributor license agreements.  See the NOTICE file distributed with
00004  * this work for additional information regarding copyright ownership.
00005  * The ASF licenses this file to You under the Apache License, Version 2.0
00006  * (the "License"); you may not use this file except in compliance with
00007  * the License.  You may obtain a copy of the License at
00008  *
00009  *      http://www.apache.org/licenses/LICENSE-2.0
00010  *
00011  * Unless required by applicable law or agreed to in writing, software
00012  * distributed under the License is distributed on an "AS IS" BASIS,
00013  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00014  * See the License for the specific language governing permissions and
00015  * limitations under the License.
00016  */
00017 
00018 /*
00019  * $Id: DFAContentModel.cpp 901107 2010-01-20 08:45:02Z borisk $
00020  */
00021 
00022 
00023 // ---------------------------------------------------------------------------
00024 //  Includes
00025 // ---------------------------------------------------------------------------
00026 #include <xercesc/util/RuntimeException.hpp>
00027 #include <xercesc/framework/XMLBuffer.hpp>
00028 #include <xercesc/framework/XMLElementDecl.hpp>
00029 #include <xercesc/framework/XMLValidator.hpp>
00030 #include <xercesc/validators/common/CMAny.hpp>
00031 #include <xercesc/validators/common/CMBinaryOp.hpp>
00032 #include <xercesc/validators/common/CMLeaf.hpp>
00033 #include <xercesc/validators/common/CMRepeatingLeaf.hpp>
00034 #include <xercesc/validators/common/CMUnaryOp.hpp>
00035 #include <xercesc/validators/common/DFAContentModel.hpp>
00036 #include <xercesc/validators/common/ContentSpecNode.hpp>
00037 #include <xercesc/validators/common/Grammar.hpp>
00038 #include <xercesc/validators/schema/SchemaSymbols.hpp>
00039 #include <xercesc/validators/schema/SubstitutionGroupComparator.hpp>
00040 #include <xercesc/validators/schema/XercesElementWildcard.hpp>
00041 #include <xercesc/util/RefHashTableOf.hpp>
00042 #include <xercesc/util/XMLInteger.hpp>
00043 #include <math.h>
00044 
00045 XERCES_CPP_NAMESPACE_BEGIN
00046 
00047 struct CMStateSetHasher
00048 {
00049   XMLSize_t getHashVal(const void *const key, XMLSize_t mod)
00050   {
00051     const CMStateSet* const pkey = (const CMStateSet*) key;
00052     return ((pkey->hashCode()) % mod);
00053   }
00054 
00055   bool equals(const void *const key1, const void *const key2)
00056   {
00057     const CMStateSet* const pkey1 = (const CMStateSet*) key1;
00058     const CMStateSet* const pkey2 = (const CMStateSet*) key2;
00059     return (*pkey1==*pkey2);
00060   }
00061 };
00062 
00063 // ---------------------------------------------------------------------------
00064 //  DFAContentModel: Constructors and Destructor
00065 // ---------------------------------------------------------------------------
00066 DFAContentModel::DFAContentModel( const bool             dtd
00067                                 , ContentSpecNode* const elemContentSpec
00068                                 , MemoryManager* const   manager) :
00069 
00070     fElemMap(0)
00071     , fElemMapType(0)
00072     , fElemMapSize(0)
00073     , fEmptyOk(false)
00074     , fEOCPos(0)
00075     , fFinalStateFlags(0)
00076     , fFollowList(0)
00077     , fHeadNode(0)
00078     , fLeafCount(0)
00079     , fLeafList(0)
00080     , fLeafListType(0)
00081     , fTransTable(0)
00082     , fTransTableSize(0)
00083     , fCountingStates(0)
00084     , fDTD(dtd)
00085     , fIsMixed(false)
00086     , fLeafNameTypeVector(0)
00087     , fMemoryManager(manager)
00088 {
00089     // And build the DFA data structures
00090     buildDFA(elemContentSpec);
00091 }
00092 
00093 DFAContentModel::DFAContentModel( const bool             dtd
00094                                 , ContentSpecNode* const elemContentSpec
00095                                 , const bool             isMixed
00096                                 , MemoryManager* const   manager):
00097 
00098     fElemMap(0)
00099     , fElemMapType(0)
00100     , fElemMapSize(0)
00101     , fEmptyOk(false)
00102     , fEOCPos(0)
00103     , fFinalStateFlags(0)
00104     , fFollowList(0)
00105     , fHeadNode(0)
00106     , fLeafCount(0)
00107     , fLeafList(0)
00108     , fLeafListType(0)
00109     , fTransTable(0)
00110     , fTransTableSize(0)
00111     , fCountingStates(0)
00112     , fDTD(dtd)
00113     , fIsMixed(isMixed)
00114     , fLeafNameTypeVector(0)
00115     , fMemoryManager(manager)
00116 {
00117     // And build the DFA data structures
00118     buildDFA(elemContentSpec);
00119 }
00120 
00121 DFAContentModel::~DFAContentModel()
00122 {
00123     //
00124     //  Clean up all the stuff that is not just temporary representation
00125     //  data that was cleaned up after building the DFA.
00126     //
00127     fMemoryManager->deallocate(fFinalStateFlags); //delete [] fFinalStateFlags;
00128 
00129     unsigned int index;
00130     for (index = 0; index < fTransTableSize; index++)
00131         fMemoryManager->deallocate(fTransTable[index]); //delete [] fTransTable[index];
00132     fMemoryManager->deallocate(fTransTable); //delete [] fTransTable;
00133 
00134     if(fCountingStates)
00135     {
00136         for (unsigned int j = 0; j < fTransTableSize; ++j)
00137             delete fCountingStates[j];
00138         fMemoryManager->deallocate(fCountingStates);
00139     }
00140 
00141     for (index = 0; index < fLeafCount; index++)
00142         delete fElemMap[index];
00143     fMemoryManager->deallocate(fElemMap); //delete [] fElemMap;
00144 
00145     fMemoryManager->deallocate(fElemMapType); //delete [] fElemMapType;
00146     fMemoryManager->deallocate(fLeafListType); //delete [] fLeafListType;
00147 
00148     delete fLeafNameTypeVector;
00149 }
00150 
00151 
00152 // ---------------------------------------------------------------------------
00153 //  DFAContentModel: Implementation of the ContentModel virtual interface
00154 // ---------------------------------------------------------------------------
00155 bool
00156 DFAContentModel::validateContent( QName** const        children
00157                                 , XMLSize_t            childCount
00158                                 , unsigned int
00159                                 , XMLSize_t*           indexFailingChild
00160                                 , MemoryManager*    const) const
00161 {
00162     //
00163     //  If there are no children, then either we fail on the 0th element
00164     //  or we return success. It depends upon whether this content model
00165     //  accepts empty content, which we determined earlier.
00166     //
00167     if (!childCount)
00168     {
00169         // success
00170         if(fEmptyOk)
00171             return true;
00172         *indexFailingChild=0;
00173         return false;
00174     }
00175 
00176     //
00177     //  Lets loop through the children in the array and move our way
00178     //  through the states. Note that we use the fElemMap array to map
00179     //  an element index to a state index.
00180     //
00181     unsigned int curState = 0;
00182     unsigned int nextState = 0;
00183     unsigned int loopCount = 0;
00184     unsigned int childIndex = 0;
00185     for (; childIndex < childCount; childIndex++)
00186     {
00187         // Get the current element index out
00188         const QName* curElem = children[childIndex];
00189         const XMLCh* curElemRawName = 0;
00190         if (fDTD)
00191             curElemRawName = curElem->getRawName();
00192 
00193         // If this is text in a Schema mixed content model, skip it.
00194         if ( fIsMixed &&
00195             ( curElem->getURI() == XMLElementDecl::fgPCDataElemId))
00196             continue;
00197 
00198         // Look up this child in our element map
00199         unsigned int elemIndex = 0;
00200         for (; elemIndex < fElemMapSize; elemIndex++)
00201         {
00202             const QName* inElem  = fElemMap[elemIndex];
00203             if (fDTD) {
00204                 if (XMLString::equals(inElem->getRawName(), curElemRawName)) {
00205                     nextState = fTransTable[curState][elemIndex];
00206                     if (nextState != XMLContentModel::gInvalidTrans)
00207                         break;
00208                 }
00209             }
00210             else {
00211                 ContentSpecNode::NodeTypes type = fElemMapType[elemIndex];
00212                 if (type == ContentSpecNode::Leaf)
00213                 {
00214                     if ((inElem->getURI() == curElem->getURI()) &&
00215                     (XMLString::equals(inElem->getLocalPart(), curElem->getLocalPart()))) {
00216                         nextState = fTransTable[curState][elemIndex];
00217                         if (nextState != XMLContentModel::gInvalidTrans)
00218                             break;
00219                     }
00220                 }
00221                 else if ((type & 0x0f)== ContentSpecNode::Any)
00222                 {
00223                     nextState = fTransTable[curState][elemIndex];
00224                     if (nextState != XMLContentModel::gInvalidTrans)
00225                         break;
00226                 }
00227                 else if ((type & 0x0f) == ContentSpecNode::Any_NS)
00228                 {
00229                     if (inElem->getURI() == curElem->getURI())
00230                     {
00231                         nextState = fTransTable[curState][elemIndex];
00232                         if (nextState != XMLContentModel::gInvalidTrans)
00233                             break;
00234                     }
00235                 }
00236                 else if ((type & 0x0f) == ContentSpecNode::Any_Other)
00237                 {
00238                     // Here we assume that empty string has id 1.
00239                     //
00240                     unsigned int uriId = curElem->getURI();
00241                     if (uriId != 1 && uriId != inElem->getURI()) {
00242                         nextState = fTransTable[curState][elemIndex];
00243                         if (nextState != XMLContentModel::gInvalidTrans)
00244                             break;
00245                     }
00246                 }
00247             }
00248         }//for elemIndex
00249 
00250         // If "nextState" is -1, we found a match, but the transition is invalid
00251         if (nextState == XMLContentModel::gInvalidTrans)
00252         {
00253             *indexFailingChild=childIndex;
00254             return false;
00255         }
00256 
00257         // If we didn't find it, then obviously not valid
00258         if (elemIndex == fElemMapSize)
00259         {
00260             *indexFailingChild=childIndex;
00261             return false;
00262         }
00263 
00264         unsigned int nextLoop = 0;
00265         if(!handleRepetitions(curElem, curState, loopCount, nextState, nextLoop, elemIndex, 0))
00266         {
00267             *indexFailingChild=childIndex;
00268             return false;
00269         }
00270 
00271         curState = nextState;
00272         loopCount = nextLoop;
00273         nextState = 0;
00274 
00275     }//for childIndex
00276 
00277     //
00278     //  We transitioned all the way through the input list. However, that
00279     //  does not mean that we ended in a final state. So check whether
00280     //  our ending state is a final state.
00281     //
00282     if (!fFinalStateFlags[curState])
00283     {
00284         *indexFailingChild=childIndex;
00285         return false;
00286     }
00287 
00288     // verify if we exited before the minOccurs was satisfied
00289     if (fCountingStates != 0) {
00290         Occurence* o = fCountingStates[curState];
00291         if (o != 0 && loopCount < (unsigned int)o->minOccurs) {
00292             // not enough loops on the current state to be considered final.
00293             *indexFailingChild=childIndex;
00294             return false;
00295         }
00296     }
00297 
00298     //success
00299     return true;
00300 }
00301 
00302 bool DFAContentModel::validateContentSpecial(QName** const            children
00303                                             , XMLSize_t               childCount
00304                                             , unsigned int
00305                                             , GrammarResolver*  const pGrammarResolver
00306                                             , XMLStringPool*    const pStringPool
00307                                             , XMLSize_t*              indexFailingChild
00308                                             , MemoryManager*    const) const
00309 {
00310 
00311     SubstitutionGroupComparator comparator(pGrammarResolver, pStringPool);
00312 
00313     if (childCount == 0)
00314     {
00315         if(fEmptyOk)
00316             return true;
00317         *indexFailingChild=0;
00318         return false;
00319     }
00320 
00321     //
00322     //  Lets loop through the children in the array and move our way
00323     //  through the states. Note that we use the fElemMap array to map
00324     //  an element index to a state index.
00325     //
00326     unsigned int curState = 0;
00327     unsigned int loopCount = 0;
00328     unsigned int nextState = 0;
00329     unsigned int childIndex = 0;
00330     for (; childIndex < childCount; childIndex++)
00331     {
00332         // Get the current element index out
00333         QName* curElem = children[childIndex];
00334 
00335         // If this is text in a Schema mixed content model, skip it.
00336         if ( fIsMixed &&
00337             ( curElem->getURI() == XMLElementDecl::fgPCDataElemId))
00338             continue;
00339 
00340         // Look up this child in our element map
00341         unsigned int elemIndex = 0;
00342         for (; elemIndex < fElemMapSize; elemIndex++)
00343         {
00344             QName* inElem  = fElemMap[elemIndex];
00345             ContentSpecNode::NodeTypes type = fElemMapType[elemIndex];
00346             if (type == ContentSpecNode::Leaf)
00347             {
00348                 if (comparator.isEquivalentTo(curElem, inElem) )
00349                 {
00350                     nextState = fTransTable[curState][elemIndex];
00351                     if (nextState != XMLContentModel::gInvalidTrans)
00352                         break;
00353                 }
00354 
00355             }
00356             else if ((type & 0x0f)== ContentSpecNode::Any)
00357             {
00358                 nextState = fTransTable[curState][elemIndex];
00359                 if (nextState != XMLContentModel::gInvalidTrans)
00360                     break;
00361             }
00362             else if ((type & 0x0f) == ContentSpecNode::Any_NS)
00363             {
00364                 if (inElem->getURI() == curElem->getURI())
00365                 {
00366                     nextState = fTransTable[curState][elemIndex];
00367                     if (nextState != XMLContentModel::gInvalidTrans)
00368                         break;
00369                 }
00370             }
00371             else if ((type & 0x0f) == ContentSpecNode::Any_Other)
00372             {
00373                 // Here we assume that empty string has id 1.
00374                 //
00375                 unsigned int uriId = curElem->getURI();
00376                 if (uriId != 1 && uriId != inElem->getURI())
00377                 {
00378                     nextState = fTransTable[curState][elemIndex];
00379                     if (nextState != XMLContentModel::gInvalidTrans)
00380                         break;
00381                 }
00382             }
00383         }//for elemIndex
00384 
00385         // If "nextState" is -1, we found a match, but the transition is invalid
00386         if (nextState == XMLContentModel::gInvalidTrans)
00387         {
00388             *indexFailingChild=childIndex;
00389             return false;
00390         }
00391 
00392         // If we didn't find it, then obviously not valid
00393         if (elemIndex == fElemMapSize)
00394         {
00395             *indexFailingChild=childIndex;
00396             return false;
00397         }
00398 
00399         unsigned int nextLoop = 0;
00400         if(!handleRepetitions(curElem, curState, loopCount, nextState, nextLoop, elemIndex, &comparator))
00401         {
00402             *indexFailingChild=childIndex;
00403             return false;
00404         }
00405 
00406         curState = nextState;
00407         loopCount = nextLoop;
00408         nextState = 0;
00409 
00410     }//for childIndex
00411 
00412     //
00413     //  We transitioned all the way through the input list. However, that
00414     //  does not mean that we ended in a final state. So check whether
00415     //  our ending state is a final state.
00416     //
00417     if (!fFinalStateFlags[curState])
00418     {
00419         *indexFailingChild=childIndex;
00420         return false;
00421     }
00422 
00423     // verify if we exited before the minOccurs was satisfied
00424     if (fCountingStates != 0) {
00425         Occurence* o = fCountingStates[curState];
00426         if (o != 0) {
00427             if (loopCount < (unsigned int)o->minOccurs) {
00428                 // not enough loops on the current state.
00429                 *indexFailingChild=childIndex;
00430                 return false;
00431             }
00432         }
00433     }
00434 
00435     //success
00436     return true;
00437 }
00438 
00439 bool DFAContentModel::handleRepetitions(const QName* const curElem,
00440                                         unsigned int curState,
00441                                         unsigned int currentLoop,
00442                                         unsigned int& nextState,
00443                                         unsigned int& nextLoop,
00444                                         XMLSize_t elemIndex,
00445                                         SubstitutionGroupComparator * comparator) const
00446 {
00447     nextLoop = 0;
00448     if (fCountingStates != 0) {
00449         nextLoop = currentLoop;
00450         Occurence* o = fCountingStates[curState];
00451         if (o != 0) {
00452             if (curState == nextState) {
00453                 if (++nextLoop > (unsigned int)o->maxOccurs && o->maxOccurs != -1) {
00454                     // It's likely that we looped too many times on the current state
00455                     // however it's possible that we actually matched another particle
00456                     // which allows the same name.
00457                     //
00458                     // Consider:
00459                     //
00460                     // <xs:sequence>
00461                     //  <xs:element name="foo" type="xs:string" minOccurs="3" maxOccurs="3"/>
00462                     //  <xs:element name="foo" type="xs:string" fixed="bar"/>
00463                     // </xs:sequence>
00464                     //
00465                     // and
00466                     //
00467                     // <xs:sequence>
00468                     //  <xs:element name="foo" type="xs:string" minOccurs="3" maxOccurs="3"/>
00469                     //  <xs:any namespace="##any" processContents="skip"/>
00470                     // </xs:sequence>
00471                     //
00472                     // In the DFA there will be two transitions from the current state which
00473                     // allow "foo". Note that this is not a UPA violation. The ambiguity of which
00474                     // transition to take is resolved by the current value of the counter. Since
00475                     // we've already seen enough instances of the first "foo" perhaps there is
00476                     // another element declaration or wildcard deeper in the element map which
00477                     // matches.
00478                     unsigned int tempNextState = 0;
00479 
00480                     while (++elemIndex < fElemMapSize) {
00481                         QName* inElem  = fElemMap[elemIndex];
00482                         ContentSpecNode::NodeTypes type = fElemMapType[elemIndex];
00483                         if (type == ContentSpecNode::Leaf)
00484                         {
00485                             if(comparator!=0) {
00486                                 if (comparator->isEquivalentTo(curElem, inElem) )
00487                                 {
00488                                     tempNextState = fTransTable[curState][elemIndex];
00489                                     if (tempNextState != XMLContentModel::gInvalidTrans)
00490                                         break;
00491                                 }
00492                             }
00493                             else if (fDTD) {
00494                                 if (XMLString::equals(inElem->getRawName(), curElem->getRawName())) {
00495                                     tempNextState = fTransTable[curState][elemIndex];
00496                                     if (tempNextState != XMLContentModel::gInvalidTrans)
00497                                         break;
00498                                 }
00499                             }
00500                             else {
00501                                 if ((inElem->getURI() == curElem->getURI()) &&
00502                                 (XMLString::equals(inElem->getLocalPart(), curElem->getLocalPart()))) {
00503                                     tempNextState = fTransTable[curState][elemIndex];
00504                                     if (tempNextState != XMLContentModel::gInvalidTrans)
00505                                         break;
00506                                 }
00507                             }
00508                         }
00509                         else if ((type & 0x0f)== ContentSpecNode::Any)
00510                         {
00511                             tempNextState = fTransTable[curState][elemIndex];
00512                             if (tempNextState != XMLContentModel::gInvalidTrans)
00513                                 break;
00514                         }
00515                         else if ((type & 0x0f) == ContentSpecNode::Any_NS)
00516                         {
00517                             if (inElem->getURI() == curElem->getURI())
00518                             {
00519                                 tempNextState = fTransTable[curState][elemIndex];
00520                                 if (tempNextState != XMLContentModel::gInvalidTrans)
00521                                     break;
00522                             }
00523                         }
00524                         else if ((type & 0x0f) == ContentSpecNode::Any_Other)
00525                         {
00526                             // Here we assume that empty string has id 1.
00527                             //
00528                             unsigned int uriId = curElem->getURI();
00529                             if (uriId != 1 && uriId != inElem->getURI())
00530                             {
00531                                 tempNextState = fTransTable[curState][elemIndex];
00532                                 if (tempNextState != XMLContentModel::gInvalidTrans)
00533                                     break;
00534                             }
00535                         }
00536                     }
00537 
00538                     // if we still can't find a match, report the error
00539                     if (elemIndex == fElemMapSize)
00540                         return false;
00541 
00542                     // if we found a match, set the next state and reset the
00543                     // counter if the next state is a counting state.
00544                     nextState = tempNextState;
00545                     Occurence* o = fCountingStates[nextState];
00546                     if (o != 0) {
00547                       nextLoop = (elemIndex == XMLSize_t (o->elemIndex)) ? 1 : 0;
00548                     }
00549                 }
00550             }
00551             else if (nextLoop < (unsigned int)o->minOccurs) {
00552                 // not enough loops on the current state.
00553                 return false;
00554             }
00555             else {
00556                 // Exiting a counting state. If we're entering a new
00557                 // counting state, reset the counter.
00558                 o = fCountingStates[nextState];
00559                 if (o != 0) {
00560                   nextLoop = (elemIndex == XMLSize_t (o->elemIndex)) ? 1 : 0;
00561                 }
00562             }
00563         }
00564         else {
00565             o = fCountingStates[nextState];
00566             if (o != 0) {
00567                 // Entering a new counting state. Reset the counter.
00568                 // If we've already seen one instance of the looping
00569                 // particle set the counter to 1, otherwise set it
00570                 // to 0.
00571               nextLoop = (elemIndex == XMLSize_t (o->elemIndex)) ? 1 : 0;
00572             }
00573         }
00574     }
00575     return true;
00576 }
00577 
00578 // ---------------------------------------------------------------------------
00579 //  DFAContentModel: Private helper methods
00580 // ---------------------------------------------------------------------------
00581 void DFAContentModel::buildDFA(ContentSpecNode* const curNode)
00582 {
00583     unsigned int index;
00584 
00585     //
00586     //  The first step we need to take is to rewrite the content model using
00587     //  our CMNode objects, and in the process get rid of any repetition short
00588     //  cuts, converting them into '*' style repetitions or getting rid of
00589     //  repetitions altogether.
00590     //
00591     //  The conversions done are:
00592     //
00593     //  x+ -> (x|x*)
00594     //  x? -> (x|epsilon)
00595     //
00596     //  This is a relatively complex scenario. What is happening is that we
00597     //  create a top level binary node of which the special EOC value is set
00598     //  as the right side node. The the left side is set to the rewritten
00599     //  syntax tree. The source is the original content model info from the
00600     //  decl pool. The rewrite is done by buildSyntaxTree() which recurses the
00601     //  decl pool's content of the element and builds a new tree in the
00602     //  process.
00603     //
00604     //  Note that, during this operation, we set each non-epsilon leaf node's
00605     //  DFA state position and count the number of such leafs, which is left
00606     //  in the fLeafCount member.
00607     //
00608     fLeafCount=countLeafNodes(curNode);
00609     fEOCPos = fLeafCount++;
00610 
00611     //  We need to build an array of references to the non-epsilon
00612     //  leaf nodes. We will put them in the array according to their position values
00613     //
00614     fLeafList = (CMLeaf**) fMemoryManager->allocate(fLeafCount*sizeof(CMLeaf*)); //new CMLeaf*[fLeafCount];
00615     fLeafListType = (ContentSpecNode::NodeTypes*) fMemoryManager->allocate
00616     (
00617         fLeafCount * sizeof(ContentSpecNode::NodeTypes)
00618     ); //new ContentSpecNode::NodeTypes[fLeafCount];
00619     //
00620     //  And, moving onward... We now need to build the follow position sets
00621     //  for all the nodes. So we allocate an array of pointers to state sets,
00622     //  one for each leaf node (i.e. each significant DFA position.)
00623     //
00624     fFollowList = (CMStateSet**) fMemoryManager->allocate
00625     (
00626         fLeafCount * sizeof(CMStateSet*)
00627     ); //new CMStateSet*[fLeafCount];
00628     for (index = 0; index < fLeafCount; index++)
00629         fFollowList[index] = new (fMemoryManager) CMStateSet(fLeafCount, fMemoryManager);
00630 
00631     //  The buildSyntaxTree function will recursively iterate over the ContentSpecNode
00632     //  and build the CMNode hierarchy; it will also put every leaf node in the fLeafList
00633     //  array, then calculate the first and last position sets of each node. This is
00634     //  cached away in each of the nodes.
00635     //
00636     //  Along the way we also set the leaf count in each node as the maximum
00637     //  state count. They must know this in order to create their first/last
00638     //  position sets.
00639     //
00640     unsigned int counter=0;
00641     CMNode* nodeOrgContent = buildSyntaxTree(curNode, counter);
00642     //
00643     //  Check to see whether this content model can handle an empty content,
00644     //  which is something we need to optimize by looking now before we
00645     //  throw away the info that would tell us that.
00646     //
00647     //  If the left node of the head (the top level of the original content)
00648     //  is nullable, then its true.
00649     //
00650     fEmptyOk = nodeOrgContent->isNullable();
00651 
00652     //
00653     //  And handle specially the EOC node, which also must be numbered and
00654     //  counted as a non-epsilon leaf node. It could not be handled in the
00655     //  above tree build because it was created before all that started. We
00656     //  save the EOC position since its used during the DFA building loop.
00657     //
00658     CMLeaf* nodeEOC = new (fMemoryManager) CMLeaf
00659     (
00660         new (fMemoryManager) QName
00661         (
00662             XMLUni::fgZeroLenString
00663             , XMLUni::fgZeroLenString
00664             , XMLContentModel::gEOCFakeId
00665             , fMemoryManager
00666         )
00667         , fEOCPos
00668         , true
00669         , fLeafCount
00670         , fMemoryManager
00671     );
00672     fHeadNode = new (fMemoryManager) CMBinaryOp
00673     (
00674         ContentSpecNode::Sequence
00675         , nodeOrgContent
00676         , nodeEOC
00677         , fLeafCount
00678         , fMemoryManager
00679     );
00680 
00681     //  Put also the final EOC node in the leaf array
00682     fLeafList[counter] = new (fMemoryManager) CMLeaf
00683     (
00684         nodeEOC->getElement()
00685         , nodeEOC->getPosition()
00686         , fLeafCount
00687         , fMemoryManager
00688     );
00689     fLeafListType[counter] = ContentSpecNode::Leaf;
00690 
00691     //
00692     //  Now handle our top level. We use our left child's last pos set and our
00693     //  right child's first pos set, so get them now for convenience.
00694     //
00695     const CMStateSet& last  = nodeOrgContent->getLastPos();
00696     const CMStateSet& first = nodeEOC->getFirstPos();
00697 
00698     //
00699     //  Now, for every position which is in our left child's last set
00700     //  add all of the states in our right child's first set to the
00701     //  follow set for that position.
00702     //
00703     CMStateSetEnumerator enumLast(&last);
00704     while(enumLast.hasMoreElements())
00705     {
00706         XMLSize_t index=enumLast.nextElement();
00707         *fFollowList[index] |= first;
00708     }
00709 
00710     //
00711     //  And finally the big push... Now we build the DFA using all the states
00712     //  and the tree we've built up. First we set up the various data
00713     //  structures we are going to use while we do this.
00714     //
00715     //  First of all we need an array of unique element ids in our content
00716     //  model. For each transition table entry, we need a set of contiguous
00717     //  indices to represent the transitions for a particular input element.
00718     //  So we need to a zero based range of indexes that map to element types.
00719     //  This element map provides that mapping.
00720     //
00721     fElemMap = (QName**) fMemoryManager->allocate
00722     (
00723         fLeafCount * sizeof(QName*)
00724     ); //new QName*[fLeafCount];
00725     fElemMapType = (ContentSpecNode::NodeTypes*) fMemoryManager->allocate
00726     (
00727         fLeafCount * sizeof(ContentSpecNode::NodeTypes)
00728     ); //new ContentSpecNode::NodeTypes[fLeafCount];
00729     fElemMapSize = 0;
00730 
00731     Occurence** elemOccurenceMap=0;
00732     for (unsigned int outIndex = 0; outIndex < fLeafCount; outIndex++)
00733     {
00734         fElemMap[outIndex] = new (fMemoryManager) QName(fMemoryManager);
00735 
00736         if ( (fLeafListType[outIndex] & 0x0f) != ContentSpecNode::Leaf )
00737             if (!fLeafNameTypeVector)
00738                 fLeafNameTypeVector = new (fMemoryManager) ContentLeafNameTypeVector(fMemoryManager);
00739 
00740         // Get the current leaf's element index
00741         CMLeaf* leaf=fLeafList[outIndex];
00742         const QName* element = leaf->getElement();
00743         const XMLCh* elementRawName = 0;
00744         if (fDTD && element)
00745             elementRawName = element->getRawName();
00746 
00747         // See if the current leaf node's element index is in the list
00748         unsigned int inIndex = 0;
00749 
00750         for (; inIndex < fElemMapSize; inIndex++)
00751         {
00752             const QName* inElem = fElemMap[inIndex];
00753             if (fDTD) {
00754                 if (XMLString::equals(inElem->getRawName(), elementRawName)) {
00755                     break;
00756                 }
00757             }
00758             else {
00759                 if ((fElemMapType[inIndex] == fLeafListType[outIndex]) &&
00760                     (inElem->getURI() == element->getURI()) &&
00761                     (XMLString::equals(inElem->getLocalPart(), element->getLocalPart()))) {
00762                     break;
00763                 }
00764             }
00765         }
00766 
00767         // If it was not in the list, then add it and bump the map size
00768         if (inIndex == fElemMapSize)
00769         {
00770             fElemMap[fElemMapSize]->setValues(*element);
00771             if(leaf->isRepeatableLeaf())
00772             {
00773                 if (elemOccurenceMap == 0) {
00774                     elemOccurenceMap = (Occurence**)fMemoryManager->allocate(fLeafCount*sizeof(Occurence*));
00775                     memset(elemOccurenceMap, 0, fLeafCount*sizeof(Occurence*));
00776                 }
00777                 elemOccurenceMap[fElemMapSize] = new (fMemoryManager) Occurence(((CMRepeatingLeaf*)leaf)->getMinOccurs(), ((CMRepeatingLeaf*)leaf)->getMaxOccurs(), fElemMapSize);
00778             }
00779             fElemMapType[fElemMapSize] = fLeafListType[outIndex];
00780             ++fElemMapSize;
00781         }
00782     }
00783 
00784     // set up the fLeafNameTypeVector object if there is one.
00785     if (fLeafNameTypeVector) {
00786         fLeafNameTypeVector->setValues(fElemMap, fElemMapType, fElemMapSize);
00787     }
00788 
00789     /***
00790      * Optimization(Jan, 2001); We sort fLeafList according to
00791      * elemIndex which is *uniquely* associated to each leaf.
00792      * We are *assuming* that each element appears in at least one leaf.
00793      **/
00794     // don't forget to delete it
00795 #ifdef OPTIMIZED_BUT_STILL_LINEAR_SEARCH
00796     int *fLeafSorter = (int*) fMemoryManager->allocate
00797     (
00798         (fLeafCount + fElemMapSize) * sizeof(int)
00799     ); //new int[fLeafCount + fElemMapSize];
00800     unsigned int fSortCount = 0;
00801 
00802     for (unsigned int elemIndex = 0; elemIndex < fElemMapSize; elemIndex++)
00803     {
00804         const QName* element = fElemMap[elemIndex];
00805         const XMLCh* elementRawName = 0;
00806         if (fDTD && element)
00807             elementRawName = element->getRawName();
00808 
00809         for (unsigned int leafIndex = 0; leafIndex < fLeafCount; leafIndex++)
00810         {
00811             const QName* leaf = fLeafList[leafIndex]->getElement();
00812             if (fDTD) {
00813                 if (XMLString::equals(leaf->getRawName(), elementRawName)) {
00814                     fLeafSorter[fSortCount++] = leafIndex;
00815                 }
00816             }
00817             else {
00818                 if ((fElemMapType[elemIndex] == fLeafListType[leafIndex]) &&
00819                     (leaf->getURI() == element->getURI()) &&
00820                     (XMLString::equals(leaf->getLocalPart(), element->getLocalPart()))) {
00821                       fLeafSorter[fSortCount++] = leafIndex;
00822                 }
00823             }
00824         }
00825         fLeafSorter[fSortCount++] = -1;
00826     }
00827 #endif
00828 
00829     // instead of using a single array with -1 to separate elements, use a bidimensional map
00830     unsigned int** fLeafSorter = (unsigned int**)fMemoryManager->allocate(fElemMapSize * sizeof(unsigned int*));
00831     unsigned int* tmpSorter = (unsigned int*)fMemoryManager->allocate(fLeafCount * sizeof(unsigned int));
00832     for (unsigned int elemIndex = 0; elemIndex < fElemMapSize; elemIndex++)
00833     {
00834         const QName* element = fElemMap[elemIndex];
00835         const XMLCh* elementRawName = 0;
00836         if (fDTD && element)
00837             elementRawName = element->getRawName();
00838 
00839         unsigned int fSortCount=0;
00840         for (unsigned int leafIndex = 0; leafIndex < fLeafCount; leafIndex++)
00841         {
00842             const QName* leaf = fLeafList[leafIndex]->getElement();
00843             if (fDTD) {
00844                 if (XMLString::equals(leaf->getRawName(), elementRawName)) {
00845                     tmpSorter[fSortCount++] = leafIndex;
00846                 }
00847             }
00848             else {
00849                 if ((fElemMapType[elemIndex] == fLeafListType[leafIndex]) &&
00850                     (leaf->getURI() == element->getURI()) &&
00851                     (XMLString::equals(leaf->getLocalPart(), element->getLocalPart()))) {
00852                       tmpSorter[fSortCount++] = leafIndex;
00853                 }
00854             }
00855         }
00856 
00857         fLeafSorter[elemIndex]=(unsigned int*)fMemoryManager->allocate((fSortCount+1) * sizeof(unsigned int));
00858         fLeafSorter[elemIndex][0]=fSortCount;
00859         for (unsigned int index=0;index<fSortCount;index++)
00860             fLeafSorter[elemIndex][index+1]=tmpSorter[index];
00861     }
00862     fMemoryManager->deallocate(tmpSorter);
00863 
00864     //
00865     //  Next lets create some arrays, some that that hold transient info
00866     //  during the DFA build and some that are permament. These are kind of
00867     //  sticky since we cannot know how big they will get, but we don't want
00868     //  to use any collection type classes because of performance.
00869     //
00870     //  Basically they will probably be about fLeafCount*2 on average, but can
00871     //  be as large as 2^(fLeafCount*2), worst case. So we start with
00872     //  fLeafCount*4 as a middle ground. This will be very unlikely to ever
00873     //  have to expand though, it if does, the overhead will be somewhat ugly.
00874     //
00875     unsigned int curArraySize = fLeafCount * 4;
00876     CMStateSet** statesToDo = (CMStateSet**)
00877         fMemoryManager->allocate
00878         (
00879             curArraySize * sizeof(CMStateSet*)
00880         ); //new const CMStateSet*[curArraySize];
00881     fFinalStateFlags = (bool*) fMemoryManager->allocate
00882     (
00883         curArraySize * sizeof(bool)
00884     ); //new bool[curArraySize];
00885     fTransTable = (unsigned int**) fMemoryManager->allocate
00886     (
00887         curArraySize * sizeof(unsigned int*)
00888     ); //new unsigned int*[curArraySize];
00889 
00890     //
00891     //  Ok we start with the initial set as the first pos set of the head node
00892     //  (which is the seq node that holds the content model and the EOC node.)
00893     //
00894     CMStateSet* setT = new (fMemoryManager) CMStateSet(fHeadNode->getFirstPos());
00895 
00896     //
00897     // Note on memory leak: Bugzilla#2707:
00898     // ===================================
00899     // The CMBinary, pointed to by fHeadNode, shall be released by
00900     // deleted by itself.
00901     //
00902     // fLeafList[] maintains its **OWN** copy of CMLeaf to avoid double deletion
00903     // of CMLeaf.
00904     //
00905 
00906     delete fHeadNode;
00907 
00908     //
00909     //  Init our two state flags. Basically the unmarked state counter is
00910     //  always chasing the current state counter. When it catches up, that
00911     //  means we made a pass through that did not add any new states to the
00912     //  lists, at which time we are done. We could have used a expanding array
00913     //  of flags which we used to mark off states as we complete them, but
00914     //  this is easier though less readable maybe.
00915     //
00916     unsigned int unmarkedState = 0;
00917     unsigned int curState = 0;
00918 
00919     //
00920     //  Init the first transition table entry, and put the initial state
00921     //  into the states to do list, then bump the current state.
00922     //
00923     fTransTable[curState] = makeDefStateList();
00924     statesToDo[curState] = setT;
00925     curState++;
00926 
00927     //
00928     // the stateTable is an auxiliary means to fast
00929     // identification of new state created (instead
00930     // of sequential loop statesToDo to find out),
00931     // while the role that statesToDo plays remain unchanged.
00932     //
00933     RefHashTableOf<XMLInteger, CMStateSetHasher> *stateTable =
00934         new (fMemoryManager) RefHashTableOf<XMLInteger, CMStateSetHasher>
00935         (
00936             curArraySize
00937             , true
00938             , fMemoryManager
00939         );
00940     //stateTable->put((CMStateSet*)setT, new (fMemoryManager) XMLInteger(0));
00941 
00942     //
00943     //  Ok, almost done with the algorithm from hell... We now enter the
00944     //  loop where we go until the states done counter catches up with
00945     //  the states to do counter.
00946     //
00947     CMStateSet* newSet = 0;
00948     while (unmarkedState < curState)
00949     {
00950         //
00951         //  Get the next unmarked state out of the list of states to do.
00952         //  And get the associated transition table entry.
00953         //
00954         setT = statesToDo[unmarkedState];
00955         unsigned int* transEntry = fTransTable[unmarkedState];
00956 
00957         // Mark this one final if it contains the EOC state
00958         fFinalStateFlags[unmarkedState] = setT->getBit(fEOCPos);
00959 
00960         // Bump up the unmarked state count, marking this state done
00961         unmarkedState++;
00962 
00963 #ifdef OPTIMIZED_BUT_STILL_LINEAR_SEARCH
00964         // Optimization(Jan, 2001)
00965         unsigned int sorterIndex = 0;
00966         // Optimization(Jan, 2001)
00967 #endif
00968 
00969         // Loop through each possible input symbol in the element map
00970         for (unsigned int elemIndex = 0; elemIndex < fElemMapSize; elemIndex++)
00971         {
00972             //
00973             //  Build up a set of states which is the union of all of the
00974             //  follow sets of DFA positions that are in the current state. If
00975             //  we gave away the new set last time through then create a new
00976             //  one. Otherwise, zero out the existing one.
00977             //
00978             if (!newSet)
00979                 newSet = new (fMemoryManager) CMStateSet
00980                 (
00981                     fLeafCount
00982                     , fMemoryManager
00983                 );
00984             else
00985                 newSet->zeroBits();
00986 
00987 #ifdef OBSOLETED
00988 // unoptimized code
00989             for (unsigned int leafIndex = 0; leafIndex < fLeafCount; leafIndex++)
00990             {
00991                 // If this leaf index (DFA position) is in the current set...
00992                 if (setT->getBit(leafIndex))
00993                 {
00994                     //
00995                     //  If this leaf is the current input symbol, then we want
00996                     //  to add its follow list to the set of states to transition
00997                     //  to from the current state.
00998                     //
00999                     const QName* leaf = fLeafList[leafIndex]->getElement();
01000                     const QName* element = fElemMap[elemIndex];
01001                     if (fDTD) {
01002                         if (XMLString::equals(leaf->getRawName(), element->getRawName())) {
01003                             *newSet |= *fFollowList[leafIndex];
01004                         }
01005                     }
01006                     else {
01007                         if ((leaf->getURI() == element->getURI()) &&
01008                             (XMLString::equals(leaf->getLocalPart(), element->getLocalPart()))) {
01009                             *newSet |= *fFollowList[leafIndex];
01010                         }
01011                     }
01012                 }
01013             } // for leafIndex
01014 #endif
01015 
01016 #ifdef OPTIMIZED_BUT_STILL_LINEAR_SEARCH
01017             // Optimization(Jan, 2001)
01018             int leafIndex = fLeafSorter[sorterIndex++];
01019 
01020             while (leafIndex != -1)
01021             {
01022                 // If this leaf index (DFA position) is in the current set...
01023                 if (setT->getBit(leafIndex))
01024                 {
01025                     //
01026                     //  If this leaf is the current input symbol, then we
01027                     //  want to add its follow list to the set of states to
01028                     //  transition to from the current state.
01029                     //
01030                     *newSet |= *fFollowList[leafIndex];
01031                 }
01032                 leafIndex = fLeafSorter[sorterIndex++];
01033             } // while (leafIndex != -1)
01034 #endif
01035 
01036             unsigned int* fLeafIndexes=fLeafSorter[elemIndex];
01037             unsigned int fNumItems=fLeafIndexes[0];
01038             if(fNumItems!=0)
01039             {
01040                 // The algorithm requires finding the leaf that is present both in the bitfield of the current state, and in the
01041                 // list of places where the currently tested item can appear. When this occurs, the follow list of this parent item
01042                 // is added to the bitfield representing the next state.
01043                 // Both the bitfield and the list of places are sorted, so we can analyze them in two ways; either iterating over the
01044                 // parent items, testing the bitfield for the existence of the parent (N times a constant Tb), or by iterating over the
01045                 // bitfield (restricted to the range of the sorted list of places), using a binary search to locate the leaf in the
01046                 // sorted list of places (M times log(N) testing operations Ts)
01047                 // Assuming that the time to test a bit is roughly the same of the time needed to compute the average of two integers,
01048                 // plus a couple of comparisons and additions, we compare N agains M*log(N) to decide which algorithm should be faster given
01049                 // the two sets
01050                 if(fNumItems <= setT->getBitCountInRange(fLeafIndexes[1], fLeafIndexes[fNumItems])*log((float)fNumItems))
01051                 {
01052                     for(unsigned int i=1; i<=fNumItems; ++i)
01053                         if(setT->getBit(fLeafIndexes[i]))
01054                         {
01055                             //
01056                             //  If this leaf is the current input symbol, then we
01057                             //  want to add its follow list to the set of states to
01058                             //  transition to from the current state.
01059                             //
01060                             *newSet |= *fFollowList[ fLeafIndexes[i] ];
01061                         }
01062                 }
01063                 else
01064                 {
01065                     // Further optimization: given that the bitfield enumerator returns the numbers in order,
01066                     // every time we raise the lower marker we know it will true also for the next bits, so
01067                     // the next binary search will not start from 1 but from this index
01068                     unsigned int lowIndex = 1;
01069                     // Start the enumerator from the first index in the sorted list of places,
01070                     // as nothing before that point will match
01071                     CMStateSetEnumerator enumBits(setT, fLeafIndexes[1]);
01072                     while(enumBits.hasMoreElements())
01073                     {
01074                         unsigned int bitIndex=enumBits.nextElement();
01075                         // if this leaf is greater than the last index in the sorted list of places,
01076                         // nothing can be found from now on, so get out of here
01077                         if(bitIndex > fLeafIndexes[fNumItems])
01078                             break;
01079 
01080                         // Check if this leaf index (DFA position) is in the current set
01081                         // (using binary search: the indexes are sorted)
01082                         unsigned int first=lowIndex,last=fNumItems,i;
01083                         while(first<=last)
01084                         {
01085                             i=(first+last)/2;
01086                             if(fLeafIndexes[i]>bitIndex)
01087                                 last=i-1;
01088                             else if(fLeafIndexes[i]<bitIndex)
01089                                 lowIndex=first=i+1;
01090                             else
01091                             {
01092                                 //
01093                                 //  If this leaf is the current input symbol, then we
01094                                 //  want to add its follow list to the set of states to
01095                                 //  transition to from the current state.
01096                                 //
01097                                 *newSet |= *fFollowList[bitIndex];
01098                                 break;
01099                             }
01100                         }
01101                     }
01102                 }
01103             }
01104 
01105             //
01106             //  If this new set is not empty, then see if its in the list
01107             //  of states to do. If not, then add it.
01108             //
01109             if (!newSet->isEmpty())
01110             {
01111                 //
01112                 //  Search the 'states to do' list to see if this new
01113                 //  state set is already in there.
01114                 //
01115                 /***
01116                 unsigned int stateIndex = 0;
01117                 for (; stateIndex < curState; stateIndex++)
01118                 {
01119                     if (*statesToDo[stateIndex] == *newSet)
01120                         break;
01121                 }
01122                 ***/
01123 
01124                 XMLInteger *stateObj = stateTable->get(newSet);
01125                 unsigned int stateIndex = (stateObj == 0 ? curState : stateObj->intValue());
01126 
01127                 // If we did not find it, then add it
01128                 if (stateIndex == curState)
01129                 {
01130                     //
01131                     //  Put this new state into the states to do and init
01132                     //  a new entry at the same index in the transition
01133                     //  table.
01134                     //
01135                     statesToDo[curState] = newSet;
01136                     fTransTable[curState] = makeDefStateList();
01137                     stateTable->put
01138                     (
01139                         newSet
01140                         , new (fMemoryManager) XMLInteger(curState)
01141                     );
01142 
01143                     // We now have a new state to do so bump the count
01144                     curState++;
01145 
01146                     //
01147                     //  Null out the new set to indicate we adopted it. This
01148                     //  will cause the creation of a new set on the next time
01149                     //  around the loop.
01150                     //
01151                     newSet = 0;
01152                 }
01153 
01154                 //
01155                 //  Now set this state in the transition table's entry for this
01156                 //  element (using its index), with the DFA state we will move
01157                 //  to from the current state when we see this input element.
01158                 //
01159                 transEntry[elemIndex] = stateIndex;
01160 
01161                 // Expand the arrays if we're full
01162                 if (curState == curArraySize)
01163                 {
01164                     //
01165                     //  Yikes, we overflowed the initial array size, so we've
01166                     //  got to expand all of these arrays. So adjust up the
01167                     //  size by 50% and allocate new arrays.
01168                     //
01169                     const unsigned int newSize = (unsigned int)(curArraySize * 1.5);
01170                     CMStateSet** newToDo = (CMStateSet**)
01171                         fMemoryManager->allocate
01172                         (
01173                             newSize * sizeof(CMStateSet*)
01174                         ); //new const CMStateSet*[newSize];
01175                     bool* newFinalFlags = (bool*) fMemoryManager->allocate
01176                     (
01177                         newSize * sizeof(bool)
01178                     ); //new bool[newSize];
01179                     unsigned int** newTransTable = (unsigned int**)
01180                         fMemoryManager->allocate
01181                         (
01182                             newSize * sizeof(unsigned int*)
01183                         ); //new unsigned int*[newSize];
01184 
01185                     // Copy over all of the existing content
01186                     for (unsigned int expIndex = 0; expIndex < curArraySize; expIndex++)
01187                     {
01188                         newToDo[expIndex] = statesToDo[expIndex];
01189                         newFinalFlags[expIndex] = fFinalStateFlags[expIndex];
01190                         newTransTable[expIndex] = fTransTable[expIndex];
01191                     }
01192 
01193                     // Clean up the old stuff
01194                     fMemoryManager->deallocate(statesToDo); //delete [] statesToDo;
01195                     fMemoryManager->deallocate(fFinalStateFlags); //delete [] fFinalStateFlags;
01196                     fMemoryManager->deallocate(fTransTable); //delete [] fTransTable;
01197 
01198                     // Store the new array size and pointers
01199                     curArraySize = newSize;
01200                     statesToDo = newToDo;
01201                     fFinalStateFlags = newFinalFlags;
01202                     fTransTable = newTransTable;
01203                 } //if (curState == curArraySize)
01204             } //if (!newSet->isEmpty())
01205         } // for elemIndex
01206     } //while
01207 
01208     // Store the current state count in the trans table size
01209     fTransTableSize = curState;
01210 
01211     //
01212     // Fill in the occurence information for each looping state
01213     // if we're using counters.
01214     //
01215     if (elemOccurenceMap != 0) {
01216         fCountingStates = (Occurence**)fMemoryManager->allocate(fTransTableSize*sizeof(Occurence));
01217         memset(fCountingStates, 0, fTransTableSize*sizeof(Occurence*));
01218         for (unsigned int i = 0; i < fTransTableSize; ++i) {
01219             unsigned int * transitions = fTransTable[i];
01220             for (unsigned int j = 0; j < fElemMapSize; ++j) {
01221                 if (i == transitions[j]) {
01222                     Occurence* old=elemOccurenceMap[j];
01223                     if(old!=0)
01224                         fCountingStates[i] = new (fMemoryManager) Occurence(old->minOccurs, old->maxOccurs, old->elemIndex);
01225                     break;
01226                 }
01227             }
01228         }
01229         for (unsigned int j = 0; j < fLeafCount; ++j) {
01230             if(elemOccurenceMap[j]!=0)
01231                 delete elemOccurenceMap[j];
01232         }
01233         fMemoryManager->deallocate(elemOccurenceMap);
01234     }
01235 
01236     // If the last temp set was not stored, then clean it up
01237     if (newSet)
01238         delete newSet;
01239 
01240     //
01241     //  Now we can clean up all of the temporary data that was needed during
01242     //  DFA build.
01243     //
01244 
01245     for (index = 0; index < fLeafCount; index++)
01246         delete fFollowList[index];
01247     fMemoryManager->deallocate(fFollowList); //delete [] fFollowList;
01248 
01249     //
01250     // removeAll() will delete all data, XMLInteger,
01251     // while the keys are to be deleted by the
01252     // deletion of statesToDo.
01253     //
01254     delete stateTable;
01255 
01256     for (index = 0; index < curState; index++)
01257         delete statesToDo[index];
01258     fMemoryManager->deallocate(statesToDo); //delete [] statesToDo;
01259 
01260     for (index = 0; index < fLeafCount; index++)
01261         delete fLeafList[index];
01262     fMemoryManager->deallocate(fLeafList); //delete [] fLeafList;
01263 
01264 #ifdef OPTIMIZED_BUT_STILL_LINEAR_SEARCH
01265     fMemoryManager->deallocate(fLeafSorter); //delete [] fLeafSorter;
01266 #endif
01267     for (index=0; index < fElemMapSize; index++)
01268         fMemoryManager->deallocate(fLeafSorter[index]);
01269     fMemoryManager->deallocate(fLeafSorter);
01270 }
01271 
01272 unsigned int DFAContentModel::countLeafNodes(ContentSpecNode* const curNode)
01273 {
01274     unsigned int count = 0;
01275 
01276     // Get the spec type of the passed node
01277     const ContentSpecNode::NodeTypes curType = curNode->getType();
01278 
01279     if ((curType & 0x0f) == ContentSpecNode::Any
01280         || (curType & 0x0f) == ContentSpecNode::Any_Other
01281         || (curType & 0x0f) == ContentSpecNode::Any_NS
01282         || curType == ContentSpecNode::Leaf
01283         || curType == ContentSpecNode::Loop)
01284     {
01285         count++;
01286     }
01287     else
01288     {
01289         //
01290         //  Its not a leaf, so we have to recurse its left and maybe right
01291         //  nodes. Save both values before we recurse and trash the node.
01292         //
01293         ContentSpecNode* leftNode = curNode->getFirst();
01294         ContentSpecNode* rightNode = curNode->getSecond();
01295 
01296         // Detect if we have a deep tree that can be analyzed using a loop instead of recursion
01297         unsigned int nLoopCount=0;
01298         ContentSpecNode* cursor=curNode;
01299         while(cursor->getType()==ContentSpecNode::Sequence && cursor->getFirst() && cursor->getFirst()->getSecond()==rightNode)
01300         {
01301             nLoopCount++;
01302             cursor=cursor->getFirst();
01303         }
01304         if(nLoopCount!=0)
01305         {
01306             count += countLeafNodes(cursor);
01307             for(unsigned int i=0;i<nLoopCount;i++)
01308                 count += countLeafNodes(rightNode);
01309             return count;
01310         }
01311         if(leftNode)
01312             count+=countLeafNodes(leftNode);
01313         if(rightNode)
01314             count+=countLeafNodes(rightNode);
01315     }
01316     return count;
01317 }
01318 
01319 CMNode* DFAContentModel::buildSyntaxTree(ContentSpecNode* const curNode
01320                                          , unsigned int&        curIndex)
01321 {
01322     // Initialize a return node pointer
01323     CMNode* retNode = 0;
01324 
01325     // Get the spec type of the passed node
01326     const ContentSpecNode::NodeTypes curType = curNode->getType();
01327 
01328     if ((curType & 0x0f) == ContentSpecNode::Any
01329         || (curType & 0x0f) == ContentSpecNode::Any_Other
01330         || (curType & 0x0f) == ContentSpecNode::Any_NS)
01331     {
01332         retNode = new (fMemoryManager) CMAny
01333         (
01334             curType
01335             , curNode->getElement()->getURI()
01336             , curIndex
01337             , fLeafCount
01338             , fMemoryManager
01339         );
01340         fLeafList[curIndex] = new (fMemoryManager) CMLeaf
01341         (
01342             new (fMemoryManager) QName
01343             (
01344                 XMLUni::fgZeroLenString
01345                 , XMLUni::fgZeroLenString
01346                 , curNode->getElement()->getURI()
01347                 , fMemoryManager
01348             )
01349             , curIndex
01350             , true
01351             , fLeafCount
01352             , fMemoryManager
01353         );
01354         fLeafListType[curIndex] = curType;
01355         ++curIndex;
01356     }
01357     else if (curType == ContentSpecNode::Leaf)
01358     {
01359         //
01360         //  Create a new leaf node, and pass it the current leaf count, which
01361         //  is its DFA state position. Bump the leaf count after storing it.
01362         //  This makes the positions zero based since we store first and then
01363         //  increment.
01364         //
01365         retNode = new (fMemoryManager) CMLeaf
01366         (
01367             curNode->getElement()
01368             , curIndex
01369             , fLeafCount
01370             , fMemoryManager
01371         );
01372         fLeafList[curIndex] = new (fMemoryManager) CMLeaf
01373         (
01374             curNode->getElement()
01375             , curIndex
01376             , fLeafCount
01377             , fMemoryManager
01378         );
01379         fLeafListType[curIndex] = ContentSpecNode::Leaf;
01380         ++curIndex;
01381     }
01382     else if (curType == ContentSpecNode::Loop)
01383     {
01384         //
01385         //  Create a new leaf node, and pass it the current leaf count, which
01386         //  is its DFA state position. Bump the leaf count after storing it.
01387         //  This makes the positions zero based since we store first and then
01388         //  increment.
01389         //
01390         retNode = new (fMemoryManager) CMRepeatingLeaf
01391         (
01392             curNode->getFirst()->getElement()
01393             , curNode->getMinOccurs()
01394             , curNode->getMaxOccurs()
01395             , curIndex
01396             , fLeafCount
01397             , fMemoryManager
01398         );
01399         fLeafList[curIndex] = new (fMemoryManager) CMRepeatingLeaf
01400         (
01401             curNode->getFirst()->getElement()
01402             , curNode->getMinOccurs()
01403             , curNode->getMaxOccurs()
01404             , curIndex
01405             , fLeafCount
01406             , fMemoryManager
01407         );
01408         fLeafListType[curIndex] = curNode->getFirst()->getType();
01409         ++curIndex;
01410     }
01411      else
01412     {
01413         //
01414         //  Its not a leaf, so we have to recurse its left and maybe right
01415         //  nodes. Save both values before we recurse and trash the node.
01416         //
01417         ContentSpecNode* leftNode = curNode->getFirst();
01418         ContentSpecNode* rightNode = curNode->getSecond();
01419 
01420         // Detect if we have a deep tree that can be analyzed using a loop instead of recursion
01421         unsigned int nLoopCount=0;
01422         ContentSpecNode* cursor=curNode;
01423         while(cursor->getType()==ContentSpecNode::Sequence && cursor->getFirst() && cursor->getFirst()->getSecond()==rightNode)
01424         {
01425             nLoopCount++;
01426             cursor=cursor->getFirst();
01427         }
01428         if(nLoopCount!=0)
01429         {
01430             retNode = buildSyntaxTree(cursor, curIndex);
01431             for(unsigned int i=0;i<nLoopCount;i++)
01432             {
01433                 CMNode* newRight = buildSyntaxTree(rightNode, curIndex);
01434                 //
01435                 //  Now handle our level. We use our left child's last pos set and our
01436                 //  right child's first pos set, so get them now for convenience.
01437                 //
01438                 const CMStateSet& last  = retNode->getLastPos();
01439                 const CMStateSet& first = newRight->getFirstPos();
01440 
01441                 //
01442                 //  Now, for every position which is in our left child's last set
01443                 //  add all of the states in our right child's first set to the
01444                 //  follow set for that position.
01445                 //
01446                 CMStateSetEnumerator enumLast(&last);
01447                 while(enumLast.hasMoreElements())
01448                 {
01449                     XMLSize_t index=enumLast.nextElement();
01450                     *fFollowList[index] |= first;
01451                 }
01452                 retNode = new (fMemoryManager) CMBinaryOp
01453                 (
01454                     ContentSpecNode::Sequence
01455                     , retNode
01456                     , newRight
01457                     , fLeafCount
01458                     , fMemoryManager
01459                 );
01460             }
01461             return retNode;
01462         }
01463 
01464         if (((curType & 0x0f) == ContentSpecNode::Choice)
01465         ||   ((curType & 0x0f) == ContentSpecNode::Sequence))
01466         {
01467             //
01468             //  Recurse on both children, and return a binary op node with the
01469             //  two created sub nodes as its children. The node type is the
01470             //  same type as the source.
01471             //
01472             CMNode* newLeft = buildSyntaxTree(leftNode, curIndex);
01473             CMNode* newRight = buildSyntaxTree(rightNode, curIndex);
01474             if(((curType & 0x0f) == ContentSpecNode::Sequence))
01475             {
01476                 //
01477                 //  Now handle our level. We use our left child's last pos set and our
01478                 //  right child's first pos set, so get them now for convenience.
01479                 //
01480                 const CMStateSet& last  = newLeft->getLastPos();
01481                 const CMStateSet& first = newRight->getFirstPos();
01482 
01483                 //
01484                 //  Now, for every position which is in our left child's last set
01485                 //  add all of the states in our right child's first set to the
01486                 //  follow set for that position.
01487                 //
01488                 CMStateSetEnumerator enumLast(&last);
01489                 while(enumLast.hasMoreElements())
01490                 {
01491                     XMLSize_t index=enumLast.nextElement();
01492                     *fFollowList[index] |= first;
01493                 }
01494             }
01495             retNode = new (fMemoryManager) CMBinaryOp
01496             (
01497                 curType
01498                 , newLeft
01499                 , newRight
01500                 , fLeafCount
01501                 , fMemoryManager
01502             );
01503         }
01504          else if (curType == ContentSpecNode::ZeroOrMore
01505                || curType == ContentSpecNode::ZeroOrOne
01506                || curType == ContentSpecNode::OneOrMore)
01507         {
01508             CMNode* newChild = buildSyntaxTree(leftNode, curIndex);
01509             if (curType == ContentSpecNode::ZeroOrMore
01510                || curType == ContentSpecNode::OneOrMore)
01511             {
01512                 //
01513                 //  Now handle our level. We use our own first and last position
01514                 //  sets, so get them up front.
01515                 //
01516                 const CMStateSet& first = newChild->getFirstPos();
01517                 const CMStateSet& last  = newChild->getLastPos();
01518 
01519                 //
01520                 //  For every position which is in our last position set, add all
01521                 //  of our first position states to the follow set for that
01522                 //  position.
01523                 //
01524                 CMStateSetEnumerator enumLast(&last);
01525                 while(enumLast.hasMoreElements())
01526                 {
01527                     XMLSize_t index=enumLast.nextElement();
01528                     *fFollowList[index] |= first;
01529                 }
01530             }
01531             // This one is fine as is, just change to our form
01532             retNode = new (fMemoryManager) CMUnaryOp
01533             (
01534                 curType
01535                 , newChild
01536                 , fLeafCount
01537                 , fMemoryManager
01538             );
01539         }
01540          else
01541         {
01542             ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::CM_UnknownCMSpecType, fMemoryManager);
01543         }
01544     }
01545     // fault in the first and last pos, then delete it children
01546     retNode->getFirstPos();
01547     retNode->getLastPos();
01548     retNode->orphanChild();
01549     return retNode;
01550 }
01551 
01552 //
01553 //  gInvalidTrans is used to represent bad transitions in the transition table
01554 //  entry for each state. So each entry is initialized to that value. This
01555 //  method creates a new entry and initializes it.
01556 //
01557 unsigned int* DFAContentModel::makeDefStateList() const
01558 {
01559     unsigned int* retArray = (unsigned int*) fMemoryManager->allocate
01560     (
01561         fElemMapSize * sizeof(unsigned int)
01562     ); //new unsigned int[fElemMapSize];
01563     for (unsigned int index = 0; index < fElemMapSize; index++)
01564         retArray[index] = XMLContentModel::gInvalidTrans;
01565     return retArray;
01566 }
01567 
01568 ContentLeafNameTypeVector* DFAContentModel::getContentLeafNameTypeVector() const
01569 {
01570    //later change it to return the data member
01571     return fLeafNameTypeVector;
01572 }
01573 
01574 void DFAContentModel::checkUniqueParticleAttribution (SchemaGrammar*    const pGrammar,
01575                                                       GrammarResolver*  const pGrammarResolver,
01576                                                       XMLStringPool*    const pStringPool,
01577                                                       XMLValidator*     const pValidator,
01578                                                       unsigned int*     const pContentSpecOrgURI,
01579                                                       const XMLCh*            pComplexTypeName /*= 0*/)
01580 {
01581 
01582     SubstitutionGroupComparator comparator(pGrammarResolver, pStringPool);
01583 
01584     unsigned int i, j, k;
01585 
01586     // Rename the URI back
01587     for (i = 0; i < fElemMapSize; i++) {
01588 
01589         unsigned int orgURIIndex = fElemMap[i]->getURI();
01590 
01591         if ((orgURIIndex != XMLContentModel::gEOCFakeId) &&
01592             (orgURIIndex != XMLContentModel::gEpsilonFakeId) &&
01593             (orgURIIndex != XMLElementDecl::fgInvalidElemId) &&
01594             (orgURIIndex != XMLElementDecl::fgPCDataElemId)) {
01595             fElemMap[i]->setURI(pContentSpecOrgURI[orgURIIndex]);
01596         }
01597     }
01598 
01599     // Unique Particle Attribution
01600     // Store the conflict results between any two elements in fElemMap
01601     // 0 - not yet tested, 1 - conflict, (-1) - no conflict
01602     signed char** conflictTable = (signed char**) fMemoryManager->allocate
01603     (
01604         fElemMapSize * sizeof(signed char*)
01605     );
01606 
01607     // initialize the conflict table
01608     for (j = 0; j < fElemMapSize; j++) {
01609         conflictTable[j] = (signed char*) fMemoryManager->allocate
01610         (
01611             fElemMapSize * sizeof(signed char)
01612         );
01613         memset(conflictTable[j], 0, fElemMapSize*sizeof(signed char));
01614     }
01615 
01616     // for each state, check whether it has overlap transitions
01617     for (i = 0; i < fTransTableSize; i++) {
01618         for (j = 0; j < fElemMapSize; j++) {
01619             for (k = j+1; k < fElemMapSize; k++) {
01620                 if (fTransTable[i][j] != XMLContentModel::gInvalidTrans &&
01621                     fTransTable[i][k] != XMLContentModel::gInvalidTrans &&
01622                     conflictTable[j][k] == 0) {
01623 
01624                     // If this is text in a Schema mixed content model, skip it.
01625                     if ( fIsMixed &&
01626                          (( fElemMap[j]->getURI() == XMLElementDecl::fgPCDataElemId) ||
01627                           ( fElemMap[k]->getURI() == XMLElementDecl::fgPCDataElemId)))
01628                         continue;
01629 
01630                     if (XercesElementWildcard::conflict(pGrammar,
01631                                                         fElemMapType[j],
01632                                                         fElemMap[j],
01633                                                         fElemMapType[k],
01634                                                         fElemMap[k],
01635                                                         &comparator)) {
01636                         if (fCountingStates != 0) {
01637                             Occurence* o = fCountingStates[i];
01638                             // If "i" is a counting state and exactly one of the transitions
01639                             // loops back to "i" then the two particles do not overlap if
01640                             // minOccurs == maxOccurs.
01641                             if (o != 0 &&
01642                                 ((fTransTable[i][j] == i) ^ (fTransTable[i][k] == i)) &&
01643                                 o->minOccurs == o->maxOccurs) {
01644                                 conflictTable[j][k] = -1;
01645                                 continue;
01646                             }
01647                         }
01648                        conflictTable[j][k] = 1;
01649 
01650                        XMLBuffer buf1(1023, fMemoryManager);
01651                        if (((fElemMapType[j] & 0x0f) == ContentSpecNode::Any) ||
01652                            ((fElemMapType[j] & 0x0f) == ContentSpecNode::Any_NS))
01653                            buf1.set(SchemaSymbols::fgATTVAL_TWOPOUNDANY);
01654                        else if ((fElemMapType[j] & 0x0f) == ContentSpecNode::Any_Other)
01655                            buf1.set(SchemaSymbols::fgATTVAL_TWOPOUNDOTHER);
01656                        else
01657                            buf1.set(fElemMap[j]->getRawName());
01658 
01659                        XMLBuffer buf2(1023, fMemoryManager);
01660                        if (((fElemMapType[k] & 0x0f) == ContentSpecNode::Any) ||
01661                            ((fElemMapType[k] & 0x0f) == ContentSpecNode::Any_NS))
01662                            buf2.set(SchemaSymbols::fgATTVAL_TWOPOUNDANY);
01663                        else if ((fElemMapType[k] & 0x0f) == ContentSpecNode::Any_Other)
01664                            buf2.set(SchemaSymbols::fgATTVAL_TWOPOUNDOTHER);
01665                        else
01666                            buf2.set(fElemMap[k]->getRawName());
01667 
01668                        pValidator->emitError(XMLValid::UniqueParticleAttributionFail,
01669                                              pComplexTypeName,
01670                                              buf1.getRawBuffer(),
01671                                              buf2.getRawBuffer());
01672                     }
01673                     else
01674                         conflictTable[j][k] = -1;
01675                 }
01676             }
01677         }
01678     }
01679 
01680     for (i = 0; i < fElemMapSize; i++)
01681         fMemoryManager->deallocate(conflictTable[i]);
01682     fMemoryManager->deallocate(conflictTable);
01683 }
01684 
01685 XERCES_CPP_NAMESPACE_END