GME  13
WFXMLScanner.cpp
Go to the documentation of this file.
00001 /*
00002  * Licensed to the Apache Software Foundation (ASF) under one or more
00003  * contributor license agreements.  See the NOTICE file distributed with
00004  * this work for additional information regarding copyright ownership.
00005  * The ASF licenses this file to You under the Apache License, Version 2.0
00006  * (the "License"); you may not use this file except in compliance with
00007  * the License.  You may obtain a copy of the License at
00008  * 
00009  *      http://www.apache.org/licenses/LICENSE-2.0
00010  * 
00011  * Unless required by applicable law or agreed to in writing, software
00012  * distributed under the License is distributed on an "AS IS" BASIS,
00013  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00014  * See the License for the specific language governing permissions and
00015  * limitations under the License.
00016  */
00017 
00018 /*
00019   * $Id: WFXMLScanner.cpp 833045 2009-11-05 13:21:27Z borisk $
00020  */
00021 
00022 
00023 // ---------------------------------------------------------------------------
00024 //  Includes
00025 // ---------------------------------------------------------------------------
00026 #include <xercesc/internal/WFXMLScanner.hpp>
00027 #include <xercesc/util/Janitor.hpp>
00028 #include <xercesc/util/RuntimeException.hpp>
00029 #include <xercesc/util/UnexpectedEOFException.hpp>
00030 #include <xercesc/sax/InputSource.hpp>
00031 #include <xercesc/framework/XMLDocumentHandler.hpp>
00032 #include <xercesc/framework/XMLEntityHandler.hpp>
00033 #include <xercesc/framework/XMLPScanToken.hpp>
00034 #include <xercesc/framework/XMLValidityCodes.hpp>
00035 #include <xercesc/internal/EndOfEntityException.hpp>
00036 #include <xercesc/util/OutOfMemoryException.hpp>
00037 
00038 XERCES_CPP_NAMESPACE_BEGIN
00039 
00040 // ---------------------------------------------------------------------------
00041 //  WFXMLScanner: Constructors and Destructor
00042 // ---------------------------------------------------------------------------
00043 
00044 
00045 typedef JanitorMemFunCall<WFXMLScanner> CleanupType;
00046 typedef JanitorMemFunCall<ReaderMgr>    ReaderMgrResetType;
00047 
00048 
00049 WFXMLScanner::WFXMLScanner( XMLValidator* const  valToAdopt
00050                           , GrammarResolver* const grammarResolver
00051                           , MemoryManager* const manager) :
00052 
00053     XMLScanner(valToAdopt, grammarResolver, manager)
00054     , fElementIndex(0)
00055     , fElements(0)
00056     , fEntityTable(0)
00057     , fAttrNameHashList(0)
00058     , fAttrNSList(0)
00059     , fElementLookup(0)
00060 {
00061     CleanupType cleanup(this, &WFXMLScanner::cleanUp);
00062 
00063     try
00064     {
00065         commonInit();
00066     }
00067     catch(const OutOfMemoryException&)
00068     {
00069         // Don't cleanup when out of memory, since executing the
00070         // code can cause problems.
00071         cleanup.release();
00072 
00073         throw;
00074     }
00075 
00076     cleanup.release();
00077 }
00078 
00079 WFXMLScanner::WFXMLScanner( XMLDocumentHandler* const docHandler
00080                           , DocTypeHandler* const     docTypeHandler
00081                           , XMLEntityHandler* const   entityHandler
00082                           , XMLErrorReporter* const   errHandler
00083                           , XMLValidator* const       valToAdopt
00084                           , GrammarResolver* const    grammarResolver
00085                           , MemoryManager* const      manager) :
00086 
00087     XMLScanner(docHandler, docTypeHandler, entityHandler, errHandler, valToAdopt, grammarResolver, manager)
00088     , fElementIndex(0)
00089     , fElements(0)
00090     , fEntityTable(0)
00091     , fAttrNameHashList(0)
00092     , fAttrNSList(0)
00093     , fElementLookup(0)
00094 {
00095     CleanupType cleanup(this, &WFXMLScanner::cleanUp);
00096 
00097     try
00098     {   
00099         commonInit();
00100     }
00101     catch(const OutOfMemoryException&)
00102     {
00103         // Don't cleanup when out of memory, since executing the
00104         // code can cause problems.
00105         cleanup.release();
00106 
00107         throw;
00108     }
00109 
00110     cleanup.release();
00111 }
00112 
00113 WFXMLScanner::~WFXMLScanner()
00114 {
00115     cleanUp();
00116 }
00117 
00118 // ---------------------------------------------------------------------------
00119 //  XMLScanner: Getter methods
00120 // ---------------------------------------------------------------------------
00121 NameIdPool<DTDEntityDecl>* WFXMLScanner::getEntityDeclPool()
00122 {
00123     return 0;
00124 }
00125 
00126 const NameIdPool<DTDEntityDecl>* WFXMLScanner::getEntityDeclPool() const
00127 {
00128     return 0;
00129 }
00130 
00131 // ---------------------------------------------------------------------------
00132 //  WFXMLScanner: Main entry point to scan a document
00133 // ---------------------------------------------------------------------------
00134 void WFXMLScanner::scanDocument(const InputSource& src)
00135 {
00136     //  Bump up the sequence id for this parser instance. This will invalidate
00137     //  any previous progressive scan tokens.
00138     fSequenceId++;
00139 
00140     ReaderMgrResetType  resetReaderMgr(&fReaderMgr, &ReaderMgr::reset);
00141 
00142     try
00143     {
00144         //  Reset the scanner and its plugged in stuff for a new run. This
00145         //  resets all the data structures, creates the initial reader and
00146         //  pushes it on the stack, and sets up the base document path.
00147         scanReset(src);
00148 
00149         // If we have a document handler, then call the start document
00150         if (fDocHandler)
00151             fDocHandler->startDocument();
00152 
00153         //  Scan the prolog part, which is everything before the root element
00154         //  including the DTD subsets.
00155         scanProlog();
00156 
00157         //  If we got to the end of input, then its not a valid XML file.
00158         //  Else, go on to scan the content.
00159         if (fReaderMgr.atEOF())
00160         {
00161             emitError(XMLErrs::EmptyMainEntity);
00162         }
00163         else
00164         {
00165             // Scan content, and tell it its not an external entity
00166             if (scanContent())
00167             {
00168                 // That went ok, so scan for any miscellaneous stuff
00169                 if (!fReaderMgr.atEOF())
00170                     scanMiscellaneous();
00171             }
00172         }
00173 
00174         // If we have a document handler, then call the end document
00175         if (fDocHandler)
00176             fDocHandler->endDocument();
00177     }
00178     //  NOTE:
00179     //
00180     //  In all of the error processing below, the emitError() call MUST come
00181     //  before the flush of the reader mgr, or it will fail because it tries
00182     //  to find out the position in the XML source of the error.
00183     catch(const XMLErrs::Codes)
00184     {
00185         // This is a 'first failure' exception, so fall through
00186     }
00187     catch(const XMLValid::Codes)
00188     {
00189         // This is a 'first fatal error' type exit, so fall through
00190     }
00191     catch(const XMLException& excToCatch)
00192     {
00193         //  Emit the error and catch any user exception thrown from here. Make
00194         //  sure in all cases we flush the reader manager.
00195         fInException = true;
00196         try
00197         {
00198             if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning)
00199                 emitError
00200                 (
00201                     XMLErrs::XMLException_Warning
00202                     , excToCatch.getCode()
00203                     , excToCatch.getMessage()
00204                 );
00205             else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal)
00206                 emitError
00207                 (
00208                     XMLErrs::XMLException_Fatal
00209                     , excToCatch.getCode()
00210                     , excToCatch.getMessage()
00211                 );
00212             else
00213                 emitError
00214                 (
00215                     XMLErrs::XMLException_Error
00216                     , excToCatch.getCode()
00217                     , excToCatch.getMessage()
00218                 );
00219         }
00220         catch(const OutOfMemoryException&)
00221         {
00222             // This is a special case for out-of-memory
00223             // conditions, because resetting the ReaderMgr
00224             // can be problematic.
00225             resetReaderMgr.release();
00226 
00227             throw;
00228         }
00229     }
00230     catch(const OutOfMemoryException&)
00231     {
00232         // This is a special case for out-of-memory
00233         // conditions, because resetting the ReaderMgr
00234         // can be problematic.
00235         resetReaderMgr.release();
00236 
00237         throw;
00238     }
00239 }
00240 
00241 
00242 bool WFXMLScanner::scanNext(XMLPScanToken& token)
00243 {
00244     // Make sure this token is still legal
00245     if (!isLegalToken(token))
00246         ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Scan_BadPScanToken, fMemoryManager);
00247 
00248     // Find the next token and remember the reader id
00249     XMLSize_t orgReader;
00250     XMLTokens curToken;
00251     bool retVal = true;
00252 
00253     ReaderMgrResetType  resetReaderMgr(&fReaderMgr, &ReaderMgr::reset);
00254 
00255     try
00256     {
00257         while (true)
00258         {
00259             //  We have to handle any end of entity exceptions that happen here.
00260             //  We could be at the end of X nested entities, each of which will
00261             //  generate an end of entity exception as we try to move forward.
00262             try
00263             {
00264                 curToken = senseNextToken(orgReader);
00265                 break;
00266             }
00267             catch(const EndOfEntityException& toCatch)
00268             {
00269                 // Send an end of entity reference event
00270                 if (fDocHandler)
00271                     fDocHandler->endEntityReference(toCatch.getEntity());
00272             }
00273         }
00274 
00275         if (curToken == Token_CharData)
00276         {
00277             scanCharData(fCDataBuf);
00278         }
00279         else if (curToken == Token_EOF)
00280         {
00281             if (!fElemStack.isEmpty())
00282             {
00283                 const ElemStack::StackElem* topElem = fElemStack.popTop();
00284                 emitError
00285                 (
00286                     XMLErrs::EndedWithTagsOnStack
00287                     , topElem->fThisElement->getFullName()
00288                 );
00289             }
00290 
00291             retVal = false;
00292         }
00293         else
00294         {
00295             // Its some sort of markup
00296             bool gotData = true;
00297             switch(curToken)
00298             {
00299                 case Token_CData :
00300                     // Make sure we are within content
00301                     if (fElemStack.isEmpty())
00302                         emitError(XMLErrs::CDATAOutsideOfContent);
00303                     scanCDSection();
00304                     break;
00305 
00306                 case Token_Comment :
00307                     scanComment();
00308                     break;
00309 
00310                 case Token_EndTag :
00311                     scanEndTag(gotData);
00312                     break;
00313 
00314                 case Token_PI :
00315                     scanPI();
00316                     break;
00317 
00318                 case Token_StartTag :
00319                     if (fDoNamespaces)
00320                         scanStartTagNS(gotData);
00321                     else
00322                         scanStartTag(gotData);
00323                     break;
00324 
00325                 default :
00326                     fReaderMgr.skipToChar(chOpenAngle);
00327                     break;
00328             }
00329 
00330             if (orgReader != fReaderMgr.getCurrentReaderNum())
00331                 emitError(XMLErrs::PartialMarkupInEntity);
00332 
00333             // If we hit the end, then do the miscellaneous part
00334             if (!gotData)
00335             {
00336                 // That went ok, so scan for any miscellaneous stuff
00337                 scanMiscellaneous();
00338 
00339                 if (fDocHandler)
00340                     fDocHandler->endDocument();
00341             }
00342         }
00343     }
00344     //  NOTE:
00345     //
00346     //  In all of the error processing below, the emitError() call MUST come
00347     //  before the flush of the reader mgr, or it will fail because it tries
00348     //  to find out the position in the XML source of the error.
00349     catch(const XMLErrs::Codes)
00350     {
00351         // This is a 'first failure' exception, so return failure
00352         retVal = false;
00353     }
00354     catch(const XMLValid::Codes)
00355     {
00356         // This is a 'first fatal error' type exit, so return failure
00357         retVal = false;
00358     }
00359     catch(const XMLException& excToCatch)
00360     {
00361         //  Emit the error and catch any user exception thrown from here. Make
00362         //  sure in all cases we flush the reader manager.
00363         fInException = true;
00364         try
00365         {
00366             if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning)
00367                 emitError
00368                 (
00369                     XMLErrs::XMLException_Warning
00370                     , excToCatch.getCode()
00371                     , excToCatch.getMessage()
00372                 );
00373             else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal)
00374                 emitError
00375                 (
00376                     XMLErrs::XMLException_Fatal
00377                     , excToCatch.getCode()
00378                     , excToCatch.getMessage()
00379                 );
00380             else
00381                 emitError
00382                 (
00383                     XMLErrs::XMLException_Error
00384                     , excToCatch.getCode()
00385                     , excToCatch.getMessage()
00386                 );
00387         }
00388         catch(const OutOfMemoryException&)
00389         {
00390             // This is a special case for out-of-memory
00391             // conditions, because resetting the ReaderMgr
00392             // can be problematic.
00393             resetReaderMgr.release();
00394 
00395             throw;
00396         }
00397 
00398         // Return failure
00399         retVal = false;
00400     }
00401     catch(const OutOfMemoryException&)
00402     {
00403         throw;
00404     }
00405 
00406     // If we are not at the end, release the object that will
00407     // reset the ReaderMgr.
00408     if (retVal)
00409         resetReaderMgr.release();
00410 
00411     return retVal;
00412 }
00413 
00414 
00415 
00416 // ---------------------------------------------------------------------------
00417 //  WFXMLScanner: Private helper methods.
00418 // ---------------------------------------------------------------------------
00419 
00420 //  This method handles the common initialization, to avoid having to do
00421 //  it redundantly in multiple constructors.
00422 void WFXMLScanner::commonInit()
00423 {
00424     fEntityTable = new (fMemoryManager) ValueHashTableOf<XMLCh>(11, fMemoryManager);
00425     fAttrNameHashList = new (fMemoryManager)ValueVectorOf<XMLSize_t>(16, fMemoryManager);
00426     fAttrNSList = new (fMemoryManager) ValueVectorOf<XMLAttr*>(8, fMemoryManager);
00427     fElements = new (fMemoryManager) RefVectorOf<XMLElementDecl>(32, true, fMemoryManager);
00428     fElementLookup = new (fMemoryManager) RefHashTableOf<XMLElementDecl>(109, false, fMemoryManager);
00429 
00430     //  Add the default entity entries for the character refs that must always
00431     //  be present.
00432     fEntityTable->put((void*) XMLUni::fgAmp, chAmpersand);
00433     fEntityTable->put((void*) XMLUni::fgLT, chOpenAngle);
00434     fEntityTable->put((void*) XMLUni::fgGT, chCloseAngle);
00435     fEntityTable->put((void*) XMLUni::fgQuot, chDoubleQuote);
00436     fEntityTable->put((void*) XMLUni::fgApos, chSingleQuote);
00437 }
00438 
00439 void WFXMLScanner::cleanUp()
00440 {
00441     delete fEntityTable;
00442     delete fAttrNameHashList;
00443     delete fAttrNSList;
00444     delete fElementLookup;
00445     delete fElements;
00446 }
00447 
00448 //  This method will reset the scanner data structures, and related plugged
00449 //  in stuff, for a new scan session. We get the input source for the primary
00450 //  XML entity, create the reader for it, and push it on the stack so that
00451 //  upon successful return from here we are ready to go.
00452 void WFXMLScanner::scanReset(const InputSource& src)
00453 {
00454     //  For all installed handlers, send reset events. This gives them
00455     //  a chance to flush any cached data.
00456     if (fDocHandler)
00457         fDocHandler->resetDocument();
00458     if (fEntityHandler)
00459         fEntityHandler->resetEntities();
00460     if (fErrorReporter)
00461         fErrorReporter->resetErrors();
00462 
00463     //  Reset the element stack, and give it the latest ids for the special
00464     //  URIs it has to know about.
00465     fElemStack.reset
00466     (
00467         fEmptyNamespaceId
00468         , fUnknownNamespaceId
00469         , fXMLNamespaceId
00470         , fXMLNSNamespaceId
00471     );
00472 
00473     // Reset some status flags
00474     fInException = false;
00475     fStandalone = false;
00476     fErrorCount = 0;
00477     fHasNoDTD = true;
00478     fElementIndex = 0;
00479 
00480     // Reset elements lookup table
00481     fElementLookup->removeAll();
00482 
00483     //  Handle the creation of the XML reader object for this input source.
00484     //  This will provide us with transcoding and basic lexing services.
00485     XMLReader* newReader = fReaderMgr.createReader
00486     (
00487         src
00488         , true
00489         , XMLReader::RefFrom_NonLiteral
00490         , XMLReader::Type_General
00491         , XMLReader::Source_External
00492         , fCalculateSrcOfs
00493         , fLowWaterMark
00494     );
00495 
00496     if (!newReader) {
00497         if (src.getIssueFatalErrorIfNotFound())
00498             ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Scan_CouldNotOpenSource, src.getSystemId(), fMemoryManager);
00499         else
00500             ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Scan_CouldNotOpenSource_Warning, src.getSystemId(), fMemoryManager);
00501     }
00502 
00503     // Push this read onto the reader manager
00504     fReaderMgr.pushReader(newReader, 0);
00505 
00506     // and reset security-related things if necessary:
00507     if(fSecurityManager != 0) 
00508     {
00509         fEntityExpansionLimit = fSecurityManager->getEntityExpansionLimit();
00510         fEntityExpansionCount = 0;
00511     }
00512 }
00513 
00514 //  This method is called between markup in content. It scans for character
00515 //  data that is sent to the document handler. It watches for any markup
00516 //  characters that would indicate that the character data has ended. It also
00517 //  handles expansion of general and character entities.
00518 //
00519 //  sendData() is a local static helper for this method which handles some
00520 //  code that must be done in three different places here.
00521 void WFXMLScanner::sendCharData(XMLBuffer& toSend)
00522 {
00523     // If no data in the buffer, then nothing to do
00524     if (toSend.isEmpty())
00525         return;
00526 
00527     // Always assume its just char data if not validating
00528     if (fDocHandler)
00529         fDocHandler->docCharacters(toSend.getRawBuffer(), toSend.getLen(), false);
00530 
00531     // Reset buffer
00532     toSend.reset();
00533 }
00534 
00535 // ---------------------------------------------------------------------------
00536 //  WFXMLScanner: Private scanning methods
00537 // ---------------------------------------------------------------------------
00538 
00539 //  This method will kick off the scanning of the primary content of the
00540 //  document, i.e. the elements.
00541 bool WFXMLScanner::scanContent()
00542 {
00543     //  Go into a loop until we hit the end of the root element, or we fall
00544     //  out because there is no root element.
00545     //
00546     //  We have to do kind of a deeply nested double loop here in order to
00547     //  avoid doing the setup/teardown of the exception handler on each
00548     //  round. Doing it this way we only do it when an exception actually
00549     //  occurs.
00550     bool gotData = true;
00551     bool inMarkup = false;
00552     while (gotData)
00553     {
00554         try
00555         {
00556             while (gotData)
00557             {
00558                 //  Sense what the next top level token is. According to what
00559                 //  this tells us, we will call something to handle that kind
00560                 //  of thing.
00561                 XMLSize_t orgReader;
00562                 const XMLTokens curToken = senseNextToken(orgReader);
00563 
00564                 //  Handle character data and end of file specially. Char data
00565                 //  is not markup so we don't want to handle it in the loop
00566                 //  below.
00567                 if (curToken == Token_CharData)
00568                 {
00569                     //  Scan the character data and call appropriate events. Let
00570                     //  him use our local character data buffer for efficiency.
00571                     scanCharData(fCDataBuf);
00572                     continue;
00573                 }
00574                 else if (curToken == Token_EOF)
00575                 {
00576                     //  The element stack better be empty at this point or we
00577                     //  ended prematurely before all elements were closed.
00578                     if (!fElemStack.isEmpty())
00579                     {
00580                         const ElemStack::StackElem* topElem = fElemStack.popTop();
00581                         emitError
00582                         (
00583                             XMLErrs::EndedWithTagsOnStack
00584                             , topElem->fThisElement->getFullName()
00585                         );
00586                     }
00587 
00588                     // Its the end of file, so clear the got data flag
00589                     gotData = false;
00590                     continue;
00591                 }
00592 
00593                 // We are in some sort of markup now
00594                 inMarkup = true;
00595 
00596                 //  According to the token we got, call the appropriate
00597                 //  scanning method.
00598                 switch(curToken)
00599                 {
00600                     case Token_CData :
00601                         // Make sure we are within content
00602                         if (fElemStack.isEmpty())
00603                             emitError(XMLErrs::CDATAOutsideOfContent);
00604                         scanCDSection();
00605                         break;
00606 
00607                     case Token_Comment :
00608                         scanComment();
00609                         break;
00610 
00611                     case Token_EndTag :
00612                         scanEndTag(gotData);
00613                         break;
00614 
00615                     case Token_PI :
00616                         scanPI();
00617                         break;
00618 
00619                     case Token_StartTag :
00620                         if (fDoNamespaces)
00621                             scanStartTagNS(gotData);
00622                         else
00623                             scanStartTag(gotData);
00624                         break;
00625 
00626                     default :
00627                         fReaderMgr.skipToChar(chOpenAngle);
00628                         break;
00629                 }
00630 
00631                 if (orgReader != fReaderMgr.getCurrentReaderNum())
00632                     emitError(XMLErrs::PartialMarkupInEntity);
00633 
00634                 // And we are back out of markup again
00635                 inMarkup = false;
00636             }
00637         }
00638         catch(const EndOfEntityException& toCatch)
00639         {
00640             //  If we were in some markup when this happened, then its a
00641             //  partial markup error.
00642             if (inMarkup)
00643                 emitError(XMLErrs::PartialMarkupInEntity);
00644 
00645             // Send an end of entity reference event
00646             if (fDocHandler)
00647                 fDocHandler->endEntityReference(toCatch.getEntity());
00648 
00649             inMarkup = false;
00650         }
00651     }
00652 
00653     // It went ok, so return success
00654     return true;
00655 }
00656 
00657 
00658 void WFXMLScanner::scanEndTag(bool& gotData)
00659 {
00660     //  Assume we will still have data until proven otherwise. It will only
00661     //  ever be false if this is the end of the root element.
00662     gotData = true;
00663 
00664     //  Check if the element stack is empty. If so, then this is an unbalanced
00665     //  element (i.e. more ends than starts, perhaps because of bad text
00666     //  causing one to be skipped.)
00667     if (fElemStack.isEmpty())
00668     {
00669         emitError(XMLErrs::MoreEndThanStartTags);
00670         fReaderMgr.skipPastChar(chCloseAngle);
00671         ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Scan_UnbalancedStartEnd, fMemoryManager);
00672     }
00673 
00674     //  Pop the stack of the element we are supposed to be ending. Remember
00675     //  that we don't own this. The stack just keeps them and reuses them.
00676     unsigned int uriId = (fDoNamespaces)
00677         ? fElemStack.getCurrentURI() : fEmptyNamespaceId;
00678     const ElemStack::StackElem* topElem = fElemStack.popTop();
00679 
00680     // See if it was the root element, to avoid multiple calls below
00681     const bool isRoot = fElemStack.isEmpty();
00682 
00683     // Make sure that its the end of the element that we expect
00684     if (!fReaderMgr.skippedStringLong(topElem->fThisElement->getFullName()))
00685     {
00686         emitError
00687         (
00688             XMLErrs::ExpectedEndOfTagX
00689             , topElem->fThisElement->getFullName()
00690         );
00691         fReaderMgr.skipPastChar(chCloseAngle);
00692         return;
00693     }
00694 
00695     // Make sure we are back on the same reader as where we started
00696     if (topElem->fReaderNum != fReaderMgr.getCurrentReaderNum())
00697         emitError(XMLErrs::PartialTagMarkupError);
00698 
00699     // Skip optional whitespace
00700     fReaderMgr.skipPastSpaces();
00701 
00702     // Make sure we find the closing bracket
00703     if (!fReaderMgr.skippedChar(chCloseAngle))
00704     {
00705         emitError
00706         (
00707             XMLErrs::UnterminatedEndTag
00708             , topElem->fThisElement->getFullName()
00709         );
00710     }
00711 
00712     // If we have a doc handler, tell it about the end tag
00713     if (fDocHandler)
00714     {
00715         fDocHandler->endElement
00716         (
00717             *topElem->fThisElement
00718             , uriId
00719             , isRoot
00720             , topElem->fThisElement->getElementName()->getPrefix()
00721         );
00722     }
00723 
00724     // If this was the root, then done with content
00725     gotData = !isRoot;
00726 }
00727 
00728 void WFXMLScanner::scanDocTypeDecl()
00729 {
00730     // Just skips over it
00731     // REVISIT: Should we issue a warning
00732     static const XMLCh doctypeIE[] =
00733     {
00734         chOpenSquare, chCloseAngle, chNull
00735     };
00736     XMLCh nextCh = fReaderMgr.skipUntilIn(doctypeIE);
00737 
00738     if (nextCh == chOpenSquare)
00739         fReaderMgr.skipPastChar(chCloseSquare);
00740 
00741     fReaderMgr.skipPastChar(chCloseAngle);
00742 }
00743 
00744 bool WFXMLScanner::scanStartTag(bool& gotData)
00745 {
00746     //  Assume we will still have data until proven otherwise. It will only
00747     //  ever be false if this is the root and its empty.
00748     gotData = true;
00749 
00750     //  Get the QName. In this case, we are not doing namespaces, so we just
00751     //  use it as is and don't have to break it into parts.
00752     if (!fReaderMgr.getName(fQNameBuf))
00753     {
00754         emitError(XMLErrs::ExpectedElementName);
00755         fReaderMgr.skipToChar(chOpenAngle);
00756         return false;
00757     }
00758 
00759     // Assume it won't be an empty tag
00760     bool isEmpty = false;
00761 
00762     // See if its the root element
00763     const bool isRoot = fElemStack.isEmpty();
00764 
00765     //  Lets try to look up the element
00766     const XMLCh* qnameRawBuf = fQNameBuf.getRawBuffer();
00767     XMLElementDecl* elemDecl = fElementLookup->get(qnameRawBuf);
00768 
00769     if (!elemDecl) {
00770 
00771         if (fElementIndex < fElements->size()) {
00772             elemDecl = fElements->elementAt(fElementIndex);
00773         }
00774         else {
00775             elemDecl = new (fGrammarPoolMemoryManager) DTDElementDecl
00776             (
00777                 fGrammarPoolMemoryManager
00778             );
00779             fElements->addElement(elemDecl);
00780         }
00781 
00782         elemDecl->setElementName(XMLUni::fgZeroLenString, qnameRawBuf, fEmptyNamespaceId);
00783         fElementLookup->put((void*)elemDecl->getFullName(), elemDecl);
00784         fElementIndex++;
00785     }
00786 
00787     // Expand the element stack and add the new element
00788     fElemStack.addLevel(elemDecl, fReaderMgr.getCurrentReaderNum());
00789 
00790     // Skip any whitespace after the name
00791     fReaderMgr.skipPastSpaces();
00792 
00793     //  We loop until we either see a /> or >, handling attribute/value
00794     //  pairs until we get there.
00795     XMLSize_t    attCount = 0;
00796     XMLSize_t    curAttListSize = fAttrList->size();
00797     while (true)
00798     {
00799         // And get the next non-space character
00800         XMLCh nextCh = fReaderMgr.peekNextChar();
00801 
00802         //  If the next character is not a slash or closed angle bracket,
00803         //  then it must be whitespace, since whitespace is required
00804         //  between the end of the last attribute and the name of the next
00805         //  one.
00806         if (attCount)
00807         {
00808             if ((nextCh != chForwardSlash) && (nextCh != chCloseAngle))
00809             {
00810                 bool bFoundSpace;
00811                 fReaderMgr.skipPastSpaces(bFoundSpace);
00812                 if (!bFoundSpace)
00813                 {
00814                     // Emit the error but keep on going
00815                     emitError(XMLErrs::ExpectedWhitespace);
00816                 }
00817                 // Ok, peek another char
00818                 nextCh = fReaderMgr.peekNextChar();
00819             }
00820         }
00821 
00822         //  Ok, here we first check for any of the special case characters.
00823         //  If its not one, then we do the normal case processing, which
00824         //  assumes that we've hit an attribute value, Otherwise, we do all
00825         //  the special case checks.
00826         if (!fReaderMgr.getCurrentReader()->isSpecialStartTagChar(nextCh))
00827         {
00828             //  Assume its going to be an attribute, so get a name from
00829             //  the input.
00830             if (!fReaderMgr.getName(fAttNameBuf))
00831             {
00832                 emitError(XMLErrs::ExpectedAttrName);
00833                 fReaderMgr.skipPastChar(chCloseAngle);
00834                 return false;
00835             }
00836 
00837             // And next must be an equal sign
00838             if (!scanEq())
00839             {
00840                 static const XMLCh tmpList[] =
00841                 {
00842                     chSingleQuote, chDoubleQuote, chCloseAngle
00843                     , chOpenAngle, chForwardSlash, chNull
00844                 };
00845 
00846                 emitError(XMLErrs::ExpectedEqSign);
00847 
00848                 //  Try to sync back up by skipping forward until we either
00849                 //  hit something meaningful.
00850                 const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList);
00851 
00852                 if ((chFound == chCloseAngle) || (chFound == chForwardSlash))
00853                 {
00854                     // Jump back to top for normal processing of these
00855                     continue;
00856                 }
00857                 else if ((chFound == chSingleQuote)
00858                       ||  (chFound == chDoubleQuote)
00859                       ||  fReaderMgr.getCurrentReader()->isWhitespace(chFound))
00860                 {
00861                     // Just fall through assuming that the value is to follow
00862                 }
00863                 else if (chFound == chOpenAngle)
00864                 {
00865                     // Assume a malformed tag and that new one is starting
00866                     emitError(XMLErrs::UnterminatedStartTag, qnameRawBuf);
00867                     return false;
00868                 }
00869                 else
00870                 {
00871                     // Something went really wrong
00872                     return false;
00873                 }
00874             }
00875 
00876             //  See if this attribute is declared more than one for this element.
00877             const XMLCh* attNameRawBuf = fAttNameBuf.getRawBuffer();
00878             XMLSize_t attNameHash = XMLString::hash(attNameRawBuf, 109);
00879 
00880             if (attCount) {
00881 
00882                 for (XMLSize_t k=0; k < attCount; k++) {
00883 
00884                     if (fAttrNameHashList->elementAt(k) == attNameHash) {
00885                         if (
00886                                XMLString::equals
00887                                (
00888                                    fAttrList->elementAt(k)->getName()
00889                                    , attNameRawBuf
00890                                )
00891                            )
00892                         {
00893                             emitError
00894                             (
00895                                 XMLErrs::AttrAlreadyUsedInSTag
00896                                 , attNameRawBuf
00897                                 , qnameRawBuf
00898                             );
00899                             break;
00900                         }
00901                     }
00902                 }
00903             }
00904 
00905             //  Skip any whitespace before the value and then scan the att
00906             //  value. This will come back normalized with entity refs and
00907             //  char refs expanded.
00908             fReaderMgr.skipPastSpaces();
00909             if (!scanAttValue(attNameRawBuf, fAttValueBuf))
00910             {
00911                 static const XMLCh tmpList[] =
00912                 {
00913                     chCloseAngle, chOpenAngle, chForwardSlash, chNull
00914                 };
00915 
00916                 emitError(XMLErrs::ExpectedAttrValue);
00917 
00918                 //  It failed, so lets try to get synced back up. We skip
00919                 //  forward until we find some whitespace or one of the
00920                 //  chars in our list.
00921                 const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList);
00922 
00923                 if ((chFound == chCloseAngle)
00924                 ||  (chFound == chForwardSlash)
00925                 ||  fReaderMgr.getCurrentReader()->isWhitespace(chFound))
00926                 {
00927                     //  Just fall through and process this attribute, though
00928                     //  the value will be "".
00929                 }
00930                 else if (chFound == chOpenAngle)
00931                 {
00932                     // Assume a malformed tag and that new one is starting
00933                     emitError(XMLErrs::UnterminatedStartTag, qnameRawBuf);
00934                     return false;
00935                 }
00936                 else
00937                 {
00938                     // Something went really wrong
00939                     return false;
00940                 }
00941             }
00942 
00943             //  Add this attribute to the attribute list that we use to
00944             //  pass them to the handler. We reuse its existing elements
00945             //  but expand it as required.
00946             XMLAttr* curAtt;
00947             if (attCount >= curAttListSize)
00948             {
00949                 curAtt = new (fMemoryManager) XMLAttr
00950                 (
00951                     0
00952                     , attNameRawBuf
00953                     , XMLUni::fgZeroLenString
00954                     , fAttValueBuf.getRawBuffer()
00955                     , XMLAttDef::CData
00956                     , true
00957                     , fMemoryManager
00958                 );
00959                 fAttrList->addElement(curAtt);
00960                 fAttrNameHashList->addElement(attNameHash);
00961             }
00962             else
00963             {
00964                 curAtt = fAttrList->elementAt(attCount);
00965                 curAtt->set
00966                 (
00967                     0
00968                     , attNameRawBuf
00969                     , XMLUni::fgZeroLenString
00970                     , fAttValueBuf.getRawBuffer()
00971                 );
00972                 curAtt->setSpecified(true);
00973                 fAttrNameHashList->setElementAt(attNameHash, attCount);
00974             }
00975             attCount++;
00976 
00977             // And jump back to the top of the loop
00978             continue;
00979         }
00980 
00981         //  It was some special case character so do all of the checks and
00982         //  deal with it.
00983         if (!nextCh)
00984             ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
00985 
00986         if (nextCh == chForwardSlash)
00987         {
00988             fReaderMgr.getNextChar();
00989             isEmpty = true;
00990             if (!fReaderMgr.skippedChar(chCloseAngle))
00991                 emitError(XMLErrs::UnterminatedStartTag, qnameRawBuf);
00992             break;
00993         }
00994         else if (nextCh == chCloseAngle)
00995         {
00996             fReaderMgr.getNextChar();
00997             break;
00998         }
00999         else if (nextCh == chOpenAngle)
01000         {
01001             //  Check for this one specially, since its going to be common
01002             //  and it is kind of auto-recovering since we've already hit the
01003             //  next open bracket, which is what we would have seeked to (and
01004             //  skipped this whole tag.)
01005             emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName());
01006             break;
01007         }
01008         else if ((nextCh == chSingleQuote) || (nextCh == chDoubleQuote))
01009         {
01010             //  Check for this one specially, which is probably a missing
01011             //  attribute name, e.g. ="value". Just issue expected name
01012             //  error and eat the quoted string, then jump back to the
01013             //  top again.
01014             emitError(XMLErrs::ExpectedAttrName);
01015             fReaderMgr.getNextChar();
01016             fReaderMgr.skipQuotedString(nextCh);
01017             fReaderMgr.skipPastSpaces();
01018             continue;
01019         }
01020     }
01021 
01022     //  If empty, validate content right now if we are validating and then
01023     //  pop the element stack top. Else, we have to update the current stack
01024     //  top's namespace mapping elements.
01025     if (isEmpty)
01026     {
01027         // Pop the element stack back off since it'll never be used now
01028         fElemStack.popTop();
01029 
01030         // If the elem stack is empty, then it was an empty root
01031         if (isRoot)
01032             gotData = false;
01033     }
01034 
01035     //  If we have a document handler, then tell it about this start tag. We
01036     //  don't have any URI id to send along, so send fEmptyNamespaceId. We also do not send
01037     //  any prefix since its just one big name if we are not doing namespaces.
01038     if (fDocHandler)
01039     {
01040         fDocHandler->startElement
01041         (
01042             *elemDecl
01043             , fEmptyNamespaceId
01044             , 0
01045             , *fAttrList
01046             , attCount
01047             , isEmpty
01048             , isRoot
01049         );
01050     }
01051 
01052     return true;
01053 }
01054 
01055 
01056 //  This method is called to scan a start tag when we are processing
01057 //  namespaces. There are two different versions of this method, one for
01058 //  namespace aware processing an done for non-namespace aware processing.
01059 //
01060 //  This method is called after we've scanned the < of a start tag. So we
01061 //  have to get the element name, then scan the attributes, after which
01062 //  we are either going to see >, />, or attributes followed by one of those
01063 //  sequences.
01064 bool WFXMLScanner::scanStartTagNS(bool& gotData)
01065 {
01066     //  Assume we will still have data until proven otherwise. It will only
01067     //  ever be false if this is the root and its empty.
01068     gotData = true;
01069 
01070     //  The current position is after the open bracket, so we need to read in
01071     //  in the element name.
01072     int colonPosition;
01073     if (!fReaderMgr.getQName(fQNameBuf, &colonPosition))
01074     {        
01075         if (fQNameBuf.isEmpty())
01076             emitError(XMLErrs::ExpectedElementName);
01077         else
01078             emitError(XMLErrs::InvalidElementName, fQNameBuf.getRawBuffer());
01079         fReaderMgr.skipToChar(chOpenAngle);
01080         return false;
01081     }
01082 
01083     // See if its the root element
01084     const bool isRoot = fElemStack.isEmpty();
01085 
01086         // Assume it won't be an empty tag
01087     bool isEmpty = false;
01088 
01089     // Skip any whitespace after the name
01090     fReaderMgr.skipPastSpaces();
01091 
01092     //  Lets try to look up the element
01093     const XMLCh* qnameRawBuf = fQNameBuf.getRawBuffer();
01094     XMLElementDecl* elemDecl = fElementLookup->get(qnameRawBuf);
01095 
01096     if (!elemDecl) {
01097         if (!XMLString::compareNString(qnameRawBuf, XMLUni::fgXMLNSColonString, 6))
01098             emitError(XMLErrs::NoXMLNSAsElementPrefix, qnameRawBuf);
01099 
01100         if (fElementIndex < fElements->size()) {
01101             elemDecl = fElements->elementAt(fElementIndex);
01102         }
01103         else {
01104             elemDecl = new (fGrammarPoolMemoryManager) DTDElementDecl
01105             (
01106                 fGrammarPoolMemoryManager
01107             );
01108             fElements->addElement(elemDecl);
01109         }
01110 
01111         elemDecl->setElementName(qnameRawBuf, fEmptyNamespaceId);
01112         fElementLookup->put((void*)elemDecl->getFullName(), elemDecl);
01113         fElementIndex++;
01114     }
01115 
01116     // Expand the element stack and add the new element
01117     fElemStack.addLevel(elemDecl, fReaderMgr.getCurrentReaderNum());
01118 
01119     // reset NS attribute list
01120     fAttrNSList->removeAllElements();
01121 
01122     // We loop until we either see a /> or >, handling attribute/value
01123     // pairs until we get there.
01124     XMLSize_t attCount = 0;
01125     XMLSize_t curAttListSize = fAttrList->size();
01126     while (true)
01127     {
01128         // And get the next non-space character
01129         XMLCh nextCh = fReaderMgr.peekNextChar();
01130 
01131         //  If the next character is not a slash or closed angle bracket,
01132         //  then it must be whitespace, since whitespace is required
01133         //  between the end of the last attribute and the name of the next
01134         //  one.
01135         if (attCount)
01136         {
01137             if ((nextCh != chForwardSlash) && (nextCh != chCloseAngle))
01138             {
01139                 bool bFoundSpace;
01140                 fReaderMgr.skipPastSpaces(bFoundSpace);
01141                 if (!bFoundSpace)
01142                 {
01143                     // Emit the error but keep on going
01144                     emitError(XMLErrs::ExpectedWhitespace);
01145                 }
01146                 // Ok, peek another char
01147                 nextCh = fReaderMgr.peekNextChar();
01148             }
01149         }
01150 
01151         //  Ok, here we first check for any of the special case characters.
01152         //  If its not one, then we do the normal case processing, which
01153         //  assumes that we've hit an attribute value, Otherwise, we do all
01154         //  the special case checks.
01155         if (!fReaderMgr.getCurrentReader()->isSpecialStartTagChar(nextCh))
01156         {
01157             //  Assume its going to be an attribute, so get a name from
01158             //  the input.
01159             int colonPosition;
01160             if (!fReaderMgr.getQName(fAttNameBuf, &colonPosition))
01161             {                
01162                 if (fAttNameBuf.isEmpty())
01163                     emitError(XMLErrs::ExpectedAttrName);
01164                 else
01165                     emitError(XMLErrs::InvalidAttrName, fAttNameBuf.getRawBuffer()); 
01166                 fReaderMgr.skipPastChar(chCloseAngle);
01167                 return false;
01168             }
01169 
01170             // And next must be an equal sign
01171             if (!scanEq())
01172             {
01173                 static const XMLCh tmpList[] =
01174                 {
01175                     chSingleQuote, chDoubleQuote, chCloseAngle
01176                     , chOpenAngle, chForwardSlash, chNull
01177                 };
01178 
01179                 emitError(XMLErrs::ExpectedEqSign);
01180 
01181                 //  Try to sync back up by skipping forward until we either
01182                 //  hit something meaningful.
01183                 const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList);
01184 
01185                 if ((chFound == chCloseAngle) || (chFound == chForwardSlash))
01186                 {
01187                     // Jump back to top for normal processing of these
01188                     continue;
01189                 }
01190                 else if ((chFound == chSingleQuote)
01191                       ||  (chFound == chDoubleQuote)
01192                       ||  fReaderMgr.getCurrentReader()->isWhitespace(chFound))
01193                 {
01194                     // Just fall through assuming that the value is to follow
01195                 }
01196                 else if (chFound == chOpenAngle)
01197                 {
01198                     // Assume a malformed tag and that new one is starting
01199                     emitError(XMLErrs::UnterminatedStartTag, qnameRawBuf);
01200                     return false;
01201                 }
01202                 else
01203                 {
01204                     // Something went really wrong
01205                     return false;
01206                 }
01207             }
01208 
01209             //  See if this attribute is declared more than one for this element.
01210             const XMLCh* attNameRawBuf = fAttNameBuf.getRawBuffer();
01211             XMLSize_t attNameHash = XMLString::hash(attNameRawBuf, 109);
01212             if (attCount) {
01213 
01214                 for (XMLSize_t k=0; k < attCount; k++) {
01215 
01216                     if (fAttrNameHashList->elementAt(k) == attNameHash) {
01217                         if (XMLString::equals(
01218                                 fAttrList->elementAt(k)->getQName()
01219                                 , attNameRawBuf))
01220                         {
01221                             emitError
01222                             (
01223                                 XMLErrs::AttrAlreadyUsedInSTag
01224                                 , attNameRawBuf
01225                                 , qnameRawBuf
01226                             );
01227                             break;
01228                         }
01229                     }
01230                 }
01231             }
01232 
01233             //  Skip any whitespace before the value and then scan the att
01234             //  value. This will come back normalized with entity refs and
01235             //  char refs expanded.
01236             fReaderMgr.skipPastSpaces();
01237             if (!scanAttValue(attNameRawBuf, fAttValueBuf))
01238             {
01239                 static const XMLCh tmpList[] =
01240                 {
01241                     chCloseAngle, chOpenAngle, chForwardSlash, chNull
01242                 };
01243 
01244                 emitError(XMLErrs::ExpectedAttrValue);
01245 
01246                 //  It failed, so lets try to get synced back up. We skip
01247                 //  forward until we find some whitespace or one of the
01248                 //  chars in our list.
01249                 const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList);
01250 
01251                 if ((chFound == chCloseAngle)
01252                 ||  (chFound == chForwardSlash)
01253                 ||  fReaderMgr.getCurrentReader()->isWhitespace(chFound))
01254                 {
01255                     //  Just fall through and process this attribute, though
01256                     //  the value will be "".
01257                 }
01258                 else if (chFound == chOpenAngle)
01259                 {
01260                     // Assume a malformed tag and that new one is starting
01261                     emitError(XMLErrs::UnterminatedStartTag, qnameRawBuf);
01262                     return false;
01263                 }
01264                 else
01265                 {
01266                     // Something went really wrong
01267                     return false;
01268                 }
01269             }
01270 
01271             //  Add this attribute to the attribute list that we use to
01272             //  pass them to the handler. We reuse its existing elements
01273             //  but expand it as required.
01274             const XMLCh* attValueRawBuf = fAttValueBuf.getRawBuffer();
01275             XMLAttr* curAtt = 0;
01276             if (attCount >= curAttListSize)
01277             {
01278                 curAtt = new (fMemoryManager) XMLAttr
01279                 (
01280                     fEmptyNamespaceId
01281                     , attNameRawBuf
01282                     , attValueRawBuf
01283                     , XMLAttDef::CData
01284                     , true
01285                     , fMemoryManager
01286                 );
01287                 fAttrList->addElement(curAtt);
01288                 fAttrNameHashList->addElement(attNameHash);
01289             }
01290             else
01291             {
01292                 curAtt = fAttrList->elementAt(attCount);
01293                 curAtt->set
01294                 (
01295                     fEmptyNamespaceId
01296                     , attNameRawBuf
01297                     , attValueRawBuf
01298                 );
01299                 curAtt->setSpecified(true);
01300                 fAttrNameHashList->setElementAt(attNameHash, attCount);
01301             }
01302 
01303             // Map prefix to namespace
01304             const XMLCh* attPrefix = curAtt->getPrefix();
01305             const XMLCh* attLocalName = curAtt->getName();
01306             const XMLCh* namespaceURI = fAttValueBuf.getRawBuffer();
01307 
01308             if (attPrefix && *attPrefix) {
01309                 if (XMLString::equals(attPrefix, XMLUni::fgXMLString)) {
01310                     curAtt->setURIId(fXMLNamespaceId);
01311                 }
01312                 else if (XMLString::equals(attPrefix, XMLUni::fgXMLNSString)) {
01313 
01314                     if (XMLString::equals(attLocalName, XMLUni::fgXMLNSString))
01315                         emitError(XMLErrs::NoUseOfxmlnsAsPrefix);
01316                     else if (XMLString::equals(attLocalName, XMLUni::fgXMLString)) {
01317                         if (!XMLString::equals(namespaceURI, XMLUni::fgXMLURIName))
01318                             emitError(XMLErrs::PrefixXMLNotMatchXMLURI);
01319                     }
01320 
01321                     if (!namespaceURI)
01322                         emitError(XMLErrs::NoEmptyStrNamespace, attNameRawBuf);
01323                     else if(!*namespaceURI && fXMLVersion == XMLReader::XMLV1_0)
01324                         emitError(XMLErrs::NoEmptyStrNamespace, attNameRawBuf);
01325 
01326                     fElemStack.addPrefix
01327                     (
01328                         attLocalName
01329                         , fURIStringPool->addOrFind(namespaceURI)
01330                     );
01331                     curAtt->setURIId(fXMLNSNamespaceId);
01332                 }
01333                 else {
01334                     fAttrNSList->addElement(curAtt);
01335                 }
01336             }
01337             else {
01338                 if (XMLString::equals(XMLUni::fgXMLNSString, attLocalName)) {
01339 
01340                     if (XMLString::equals(namespaceURI, XMLUni::fgXMLNSURIName))
01341                         emitError(XMLErrs::NoUseOfxmlnsURI);
01342                     else if (XMLString::equals(namespaceURI, XMLUni::fgXMLURIName))
01343                         emitError(XMLErrs::XMLURINotMatchXMLPrefix);
01344 
01345                     fElemStack.addPrefix
01346                     (
01347                         XMLUni::fgZeroLenString
01348                         , fURIStringPool->addOrFind(namespaceURI)
01349                     );
01350                 }
01351             }
01352 
01353             // increment attribute count
01354             attCount++;
01355 
01356             // And jump back to the top of the loop
01357             continue;
01358         }
01359 
01360         //  It was some special case character so do all of the checks and
01361         //  deal with it.
01362         if (!nextCh)
01363             ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
01364 
01365         if (nextCh == chForwardSlash)
01366         {
01367             fReaderMgr.getNextChar();
01368             isEmpty = true;
01369             if (!fReaderMgr.skippedChar(chCloseAngle))
01370                 emitError(XMLErrs::UnterminatedStartTag, qnameRawBuf);
01371             break;
01372         }
01373         else if (nextCh == chCloseAngle)
01374         {
01375             fReaderMgr.getNextChar();
01376             break;
01377         }
01378         else if (nextCh == chOpenAngle)
01379         {
01380             //  Check for this one specially, since its going to be common
01381             //  and it is kind of auto-recovering since we've already hit the
01382             //  next open bracket, which is what we would have seeked to (and
01383             //  skipped this whole tag.)
01384             emitError(XMLErrs::UnterminatedStartTag, qnameRawBuf);
01385             break;
01386         }
01387         else if ((nextCh == chSingleQuote) || (nextCh == chDoubleQuote))
01388         {
01389             //  Check for this one specially, which is probably a missing
01390             //  attribute name, e.g. ="value". Just issue expected name
01391             //  error and eat the quoted string, then jump back to the
01392             //  top again.
01393             emitError(XMLErrs::ExpectedAttrName);
01394             fReaderMgr.getNextChar();
01395             fReaderMgr.skipQuotedString(nextCh);
01396             fReaderMgr.skipPastSpaces();
01397             continue;
01398         }
01399     }
01400 
01401     // Handle provided attributes that we did not map their prefixes
01402     for (unsigned int i=0; i < fAttrNSList->size(); i++) {
01403 
01404         XMLAttr* providedAttr = fAttrNSList->elementAt(i);
01405 
01406         providedAttr->setURIId
01407         (
01408                 resolvePrefix
01409             (
01410                 providedAttr->getPrefix(),
01411                 ElemStack::Mode_Attribute
01412             )
01413         );
01414     }
01415 
01416     if(attCount) {
01417 
01418         //
01419         // Decide if to use hash table to do duplicate checking
01420         //
01421         bool toUseHashTable = false;
01422         setAttrDupChkRegistry(attCount, toUseHashTable);
01423 
01424         // check for duplicate namespace attributes:
01425         // by checking for qualified names with the same local part and with prefixes 
01426         // which have been bound to namespace names that are identical. 
01427         XMLAttr* loopAttr;
01428         XMLAttr* curAtt;
01429         for (unsigned int attrIndex=0; attrIndex < attCount-1; attrIndex++) {
01430             loopAttr = fAttrList->elementAt(attrIndex);
01431 
01432             if (!toUseHashTable)
01433             {
01434                 for (unsigned int curAttrIndex = attrIndex+1; curAttrIndex < attCount; curAttrIndex++) {
01435                     curAtt = fAttrList->elementAt(curAttrIndex);
01436                     if (curAtt->getURIId() == loopAttr->getURIId() &&
01437                         XMLString::equals(curAtt->getName(), loopAttr->getName())) {
01438                         emitError
01439                             ( 
01440                             XMLErrs::AttrAlreadyUsedInSTag
01441                             , curAtt->getName()
01442                             , elemDecl->getFullName()
01443                             );
01444                     }
01445                 }
01446             }
01447             else 
01448             {
01449                 if (fAttrDupChkRegistry->containsKey((void*)loopAttr->getName(), loopAttr->getURIId()))
01450                 {
01451                     emitError
01452                     ( 
01453                     XMLErrs::AttrAlreadyUsedInSTag
01454                     , loopAttr->getName()
01455                     , elemDecl->getFullName()
01456                     );
01457                 }
01458 
01459                 fAttrDupChkRegistry->put((void*)loopAttr->getName(), loopAttr->getURIId(), loopAttr);
01460             }
01461         }  
01462     }
01463 
01464     // Resolve the qualified name to a URI.
01465     unsigned int uriId = resolvePrefix
01466     (
01467         elemDecl->getElementName()->getPrefix()
01468         , ElemStack::Mode_Element
01469     );
01470 
01471     // Now we can update the element stack
01472     fElemStack.setCurrentURI(uriId);
01473 
01474     // Tell the document handler about this start tag
01475     if (fDocHandler)
01476     {
01477         fDocHandler->startElement
01478         (
01479             *elemDecl
01480             , uriId
01481             , elemDecl->getElementName()->getPrefix()
01482             , *fAttrList
01483             , attCount
01484             , isEmpty
01485             , isRoot
01486         );
01487     }
01488 
01489     //  If empty, validate content right now if we are validating and then
01490     //  pop the element stack top. Else, we have to update the current stack
01491     //  top's namespace mapping elements.
01492     if (isEmpty)
01493     {
01494         // Pop the element stack back off since it'll never be used now
01495         fElemStack.popTop();
01496 
01497         // If the elem stack is empty, then it was an empty root
01498         if (isRoot)
01499             gotData = false;
01500     }
01501 
01502     return true;
01503 }
01504 
01505 // ---------------------------------------------------------------------------
01506 //  WFXMLScanner: Private parsing methods
01507 // ---------------------------------------------------------------------------
01508 bool WFXMLScanner::scanAttValue(const XMLCh* const attrName
01509                               ,     XMLBuffer&   toFill)
01510 {
01511     // Reset the target buffer
01512     toFill.reset();
01513 
01514     // Get the next char which must be a single or double quote
01515     XMLCh quoteCh;
01516     if (!fReaderMgr.skipIfQuote(quoteCh))
01517         return false;
01518 
01519     //  We have to get the current reader because we have to ignore closing
01520     //  quotes until we hit the same reader again.
01521     const XMLSize_t curReader = fReaderMgr.getCurrentReaderNum();
01522 
01523     //  Loop until we get the attribute value. Note that we use a double
01524     //  loop here to avoid the setup/teardown overhead of the exception
01525     //  handler on every round.
01526     XMLCh   nextCh;
01527     XMLCh   secondCh = 0;
01528     bool    gotLeadingSurrogate = false;
01529     bool    escaped;
01530     while (true)
01531     {
01532     try
01533     {
01534         while(true)
01535         {
01536             nextCh = fReaderMgr.getNextChar();
01537 
01538             if (!nextCh)
01539                 ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
01540 
01541             // Check for our ending quote in the same entity
01542             if (nextCh == quoteCh)
01543             {
01544                 if (curReader == fReaderMgr.getCurrentReaderNum())
01545                     return true;
01546 
01547                 // Watch for spillover into a previous entity
01548                 if (curReader > fReaderMgr.getCurrentReaderNum())
01549                 {
01550                     emitError(XMLErrs::PartialMarkupInEntity);
01551                     return false;
01552                 }
01553             }
01554 
01555             //  Check for an entity ref now, before we let it affect our
01556             //  whitespace normalization logic below. We ignore the empty flag
01557             //  in this one.
01558             escaped = false;
01559             if (nextCh == chAmpersand)
01560             {
01561                 if (scanEntityRef(true, nextCh, secondCh, escaped) != EntityExp_Returned)
01562                 {
01563                     gotLeadingSurrogate = false;
01564                     continue;
01565                 }
01566             }
01567             else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
01568             {
01569                 // Deal with surrogate pairs
01570                 //  Its a leading surrogate. If we already got one, then
01571                 //  issue an error, else set leading flag to make sure that
01572                 //  we look for a trailing next time.
01573                 if (gotLeadingSurrogate)
01574                 {
01575                     emitError(XMLErrs::Expected2ndSurrogateChar);
01576                 }
01577                 else
01578                     gotLeadingSurrogate = true;
01579             }
01580             else
01581             {
01582                 //  If its a trailing surrogate, make sure that we are
01583                 //  prepared for that. Else, its just a regular char so make
01584                 //  sure that we were not expected a trailing surrogate.
01585                 if ((nextCh >= 0xDC00) && (nextCh <= 0xDFFF))
01586                 {
01587                     // Its trailing, so make sure we were expecting it
01588                     if (!gotLeadingSurrogate)
01589                         emitError(XMLErrs::Unexpected2ndSurrogateChar);
01590                 }
01591                 else
01592                 {
01593                     //  Its just a char, so make sure we were not expecting a
01594                     //  trailing surrogate.
01595                     if (gotLeadingSurrogate) {
01596                         emitError(XMLErrs::Expected2ndSurrogateChar);
01597                     }
01598                     // Its got to at least be a valid XML character
01599                     else if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh))
01600                     {
01601                         XMLCh tmpBuf[9];
01602                         XMLString::binToText
01603                         (
01604                             nextCh
01605                             , tmpBuf
01606                             , 8
01607                             , 16
01608                             , fMemoryManager
01609                         );
01610                         emitError(XMLErrs::InvalidCharacterInAttrValue, attrName, tmpBuf);
01611                     }
01612                 }
01613                 gotLeadingSurrogate = false;
01614             }
01615 
01616             //  If its not escaped, then make sure its not a < character, which
01617             //  is not allowed in attribute values.
01618             if (!escaped) {
01619                 if (nextCh == chOpenAngle)
01620                     emitError(XMLErrs::BracketInAttrValue, attrName);
01621                 else if (fReaderMgr.getCurrentReader()->isWhitespace(nextCh))
01622                     nextCh = chSpace;
01623             }
01624 
01625             // Else add it to the buffer
01626             toFill.append(nextCh);
01627 
01628             if (secondCh)
01629             {
01630                 toFill.append(secondCh);
01631                 secondCh=0;
01632             }
01633         }
01634     }
01635     catch(const EndOfEntityException&)
01636     {
01637         // Just eat it and continue.
01638         gotLeadingSurrogate = false;
01639         escaped = false;
01640     }
01641     }
01642     return true;
01643 }
01644 
01645 
01646 //  This method scans a CDATA section. It collects the characters into a
01647 //  temp buffer and, once the closing ]]> is found, passes them to the
01648 //  document handler, if any. It assumes that the <![CDATA string has been
01649 //  scanned before this call. End of input before ]]> emits an error and throws UnexpectedEOFException.
01650 void WFXMLScanner::scanCDSection()
01651 {
01652     static const XMLCh CDataClose[] =
01653     {
01654             chCloseSquare, chCloseAngle, chNull
01655     };
01656 
01657     //  The next character should be the opening square bracket. If not,
01658     //  issue an error, but then try to recover by skipping any whitespace
01659     //  and checking again.
01660     if (!fReaderMgr.skippedChar(chOpenSquare))
01661     {
01662         emitError(XMLErrs::ExpectedOpenSquareBracket);
01663         fReaderMgr.skipPastSpaces();
01664 
01665         // If we still don't find it, then give up, else keep going
01666         if (!fReaderMgr.skippedChar(chOpenSquare))
01667             return;
01668     }
01669 
01670     // Get a buffer to accumulate the section's characters
01671     XMLBufBid bbCData(&fBufMgr);
01672 
01673     //  We just scan forward until we hit the end of CDATA section sequence.
01674     //  CDATA is effectively a big escape mechanism so we don't treat markup
01675     //  characters specially here.
01676     bool            emittedError = false;
01677     bool    gotLeadingSurrogate = false;
01678     while (true)
01679     {
01680         const XMLCh nextCh = fReaderMgr.getNextChar();
01681 
01682         // Watch for unexpected end of file
01683         if (!nextCh)
01684         {
01685             emitError(XMLErrs::UnterminatedCDATASection);
01686             ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
01687         }
01688 
01689         //  If this is a close square bracket it could be our closing
01690         //  sequence. The first ] is already in nextCh, so only ]> remains to be matched.
01691         if (nextCh == chCloseSquare && fReaderMgr.skippedString(CDataClose))
01692         {
01693             //  Make sure we were not expecting a trailing surrogate.
01694             if (gotLeadingSurrogate)
01695                 emitError(XMLErrs::Expected2ndSurrogateChar);
01696 
01697             // Hand the accumulated text to the doc handler, if we have one
01698             if (fDocHandler)
01699             {
01700                 fDocHandler->docCharacters
01701                 (
01702                     bbCData.getRawBuffer()
01703                     , bbCData.getLen()
01704                     , true
01705                 );
01706             }
01707 
01708             // And we are done
01709             break;
01710         }
01711 
01712         //  Make sure it's a valid character. But if we've already emitted an
01713         //  invalid character error, don't bother with the overhead since we've
01714         //  already told them about it. Note that this also skips the surrogate pair checks.
01715         if (!emittedError)
01716         {
01717             // Deal with surrogate pairs
01718             if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
01719             {
01720                 //  It's a leading surrogate. If we already got one, then
01721                 //  issue an error, else set leading flag to make sure that
01722                 //  we look for a trailing next time.
01723                 if (gotLeadingSurrogate)
01724                     emitError(XMLErrs::Expected2ndSurrogateChar);
01725                 else
01726                     gotLeadingSurrogate = true;
01727             }
01728             else
01729             {
01730                 //  If it's a trailing surrogate, make sure that we are
01731                 //  prepared for that. Else, it's just a regular char so make
01732                 //  sure that we were not expecting a trailing surrogate.
01733                 if ((nextCh >= 0xDC00) && (nextCh <= 0xDFFF))
01734                 {
01735                     // It's trailing, so make sure we were expecting it
01736                     if (!gotLeadingSurrogate)
01737                         emitError(XMLErrs::Unexpected2ndSurrogateChar);
01738                 }
01739                 else
01740                 {
01741                     //  It's just a char, so make sure we were not expecting a
01742                     //  trailing surrogate.
01743                     if (gotLeadingSurrogate)
01744                         emitError(XMLErrs::Expected2ndSurrogateChar);
01745 
01746                     // It's got to at least be a valid XML character
01747                     else if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh))
01748                     {
01749                         XMLCh tmpBuf[9];
01750                         XMLString::binToText
01751                         (
01752                             nextCh
01753                             , tmpBuf
01754                             , 8
01755                             , 16
01756                             , fMemoryManager
01757                         );
01758                         emitError(XMLErrs::InvalidCharacter, tmpBuf);
01759                         emittedError = true;
01760                     }
01761                 }
01762                 gotLeadingSurrogate = false;
01763             }
01764         }
01765 
01766         // Add it to the buffer (done even for characters that were reported as errors)
01767         bbCData.append(nextCh);
01768     }
01769 }
01770 
01771 
01772 void WFXMLScanner::scanCharData(XMLBuffer& toUse)
01773 {
01774     //  Scans character data into toUse, sending it out via sendCharData(). We have to watch
01775     //  for the ]]> sequence, which is illegal in character data, so a little state machine tracks it.
01776     enum States
01777     {
01778         State_Waiting
01779         , State_GotOne
01780         , State_GotTwo
01781     };
01782 
01783     // Reset the buffer before we start
01784     toUse.reset();
01785 
01786     // Turn on the 'throw at end' flag of the reader manager
01787     ThrowEOEJanitor jan(&fReaderMgr, true);
01788 
01789     //  In order to be more efficient we have to use kind of a deeply nested
01790     //  set of blocks here. The outer block puts on a try and catches end of
01791     //  entity exceptions. The inner loop is the per-character loop. If we
01792     //  put the try inside the inner loop, it would work but would require
01793     //  the exception handling setup/teardown code to be invoked for
01794     //  each character.
01795     XMLCh   nextCh;
01796     XMLCh   secondCh = 0;
01797     States  curState = State_Waiting;
01798     bool    escaped = false;
01799     bool    gotLeadingSurrogate = false;
01800     bool    notDone = true;
01801     while (notDone)
01802     {
01803         try
01804         {
01805             while (true)
01806             {
01807                 //  Eat through as many plain content characters as possible without
01808                 //  needing special handling.  Moving most content characters here,
01809                 //  in this one call, rather than running the overall loop once
01810                 //  per content character, is a speed optimization.
01811                 if (curState == State_Waiting  &&  !gotLeadingSurrogate)
01812                 {
01813                      fReaderMgr.movePlainContentChars(toUse);
01814                 }
01815 
01816                 // Try to get another char from the source; if there is none to take, we are done.
01817                 //   The code from here on down covers all contingencies.
01818                 if (!fReaderMgr.getNextCharIfNot(chOpenAngle, nextCh))
01819                 {
01820                     // If we were waiting for a trailing surrogate, it's an error
01821                     if (gotLeadingSurrogate)
01822                         emitError(XMLErrs::Expected2ndSurrogateChar);
01823 
01824                     notDone = false;
01825                     break;
01826                 }
01827 
01828                 //  Watch for a reference. Note that the escapement mechanism
01829                 //  is ignored in this content.
01830                 escaped = false;
01831                 if (nextCh == chAmpersand)
01832                 {
01833                     sendCharData(toUse);
01834 
01835                     // Turn off the throwing at the end of entity during this (note: shadows the outer jan)
01836                     ThrowEOEJanitor jan(&fReaderMgr, false);
01837 
01838                     if (scanEntityRef(false, nextCh, secondCh, escaped) != EntityExp_Returned)
01839                     {
01840                         gotLeadingSurrogate = false;
01841                         continue;
01842                     }
01843                 }
01844                 else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
01845                 {
01846                     // Deal with surrogate pairs
01847                     //  It's a leading surrogate. If we already got one, then
01848                     //  issue an error, else set leading flag to make sure that
01849                     //  we look for a trailing next time.
01850                     if (gotLeadingSurrogate)
01851                     {
01852                         emitError(XMLErrs::Expected2ndSurrogateChar);
01853                     }
01854                     else
01855                         gotLeadingSurrogate = true;
01856                 }
01857                 else
01858                 {
01859                     //  If it's a trailing surrogate, make sure that we are
01860                     //  prepared for that. Else, it's just a regular char so make
01861                     //  sure that we were not expecting a trailing surrogate.
01862                     if ((nextCh >= 0xDC00) && (nextCh <= 0xDFFF))
01863                     {
01864                         // It's trailing, so make sure we were expecting it
01865                         if (!gotLeadingSurrogate)
01866                             emitError(XMLErrs::Unexpected2ndSurrogateChar);
01867                     }
01868                     else
01869                     {
01870                         //  It's just a char, so make sure we were not expecting a
01871                         //  trailing surrogate.
01872                         if (gotLeadingSurrogate) {
01873                             emitError(XMLErrs::Expected2ndSurrogateChar);
01874                         }
01875                         // It's got to at least be a valid XML character
01876                         else if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh))
01877                         {
01878                             XMLCh tmpBuf[9];
01879                             XMLString::binToText
01880                             (
01881                                 nextCh
01882                                 , tmpBuf
01883                                 , 8
01884                                 , 16
01885                                 , fMemoryManager
01886                             );
01887                             emitError(XMLErrs::InvalidCharacter, tmpBuf);
01888                         }
01889                     }
01890                     gotLeadingSurrogate = false;
01891                 }
01892 
01893                 // Keep the state machine up to date (an escaped character never counts toward ]]>)
01894                 if (!escaped)
01895                 {
01896                     if (nextCh == chCloseSquare)
01897                     {
01898                         if (curState == State_Waiting)
01899                             curState = State_GotOne;
01900                         else if (curState == State_GotOne)
01901                             curState = State_GotTwo;
01902                     }
01903                     else if (nextCh == chCloseAngle)
01904                     {
01905                         if (curState == State_GotTwo)
01906                             emitError(XMLErrs::BadSequenceInCharData);
01907                         curState = State_Waiting;
01908                     }
01909                     else
01910                     {
01911                         curState = State_Waiting;
01912                     }
01913                 }
01914                 else
01915                 {
01916                     curState = State_Waiting;
01917                 }
01918 
01919                 // Add this char to the buffer
01920                 toUse.append(nextCh);
01921 
01922                 if (secondCh) // second character handed back by scanEntityRef(), if any
01923                 {
01924                     toUse.append(secondCh);
01925                     secondCh=0;
01926                 }
01927             }
01928         }
01929         catch(const EndOfEntityException& toCatch)
01930         {
01931             //  Some entity ended, so we have to send any accumulated
01932             //  chars and send an end of entity event.
01933             sendCharData(toUse);
01934             gotLeadingSurrogate = false;
01935 
01936             if (fDocHandler)
01937                 fDocHandler->endEntityReference(toCatch.getEntity());
01938         }
01939     }
01940 
01941     // Send any char data that we accumulated into the buffer
01942     sendCharData(toUse);
01943 }
01944 
01945 InputSource* WFXMLScanner::resolveSystemId(const XMLCh* const /*sysId*/
01946                                           ,const XMLCh* const /*pubId*/)
01947 {
01948     return 0; // both ids are ignored; this scanner always returns a null InputSource
01949 }
01950 
01951 //  This method will scan a general/character entity ref. It will either
01952 //  expand a char ref, or look the entity name up in fEntityTable, and return
01953 //  the resulting character(s) directly. This implementation never pushes a reader.
01954 //
01955 //  The return value is EntityExp_Returned when the char parameters hold the
01956 //  value, or EntityExp_Failed when the reference could not be scanned or resolved.
01957 //
01958 //  The escaped flag tells the caller whether the returned parameter resulted
01959 //  from a reference, which escapes the character in some cases. It is set for
01960 //  both char refs and table entities, and only makes any difference if the
01961 //  return value indicates the value was returned directly. The first (bool) parameter is unused.
01962 XMLScanner::EntityExpRes
01963 WFXMLScanner::scanEntityRef(const bool
01964                             ,     XMLCh&  firstCh
01965                             ,     XMLCh&  secondCh
01966                             ,     bool&   escaped)
01967 {
01968     // Assume no escape
01969     secondCh = 0;
01970     escaped = false;
01971 
01972     // We have to ensure that it's all in one entity
01973     const XMLSize_t curReader = fReaderMgr.getCurrentReaderNum();
01974 
01975     //  If the next char is a pound, then it's a character reference and we
01976     //  need to expand it always.
01977     if (fReaderMgr.skippedChar(chPound))
01978     {
01979         //  It's a character reference, so scan it and get back the numeric
01980         //  value it represents.
01981         if (!scanCharRef(firstCh, secondCh))
01982             return EntityExp_Failed;
01983 
01984         escaped = true;
01985 
01986         if (curReader != fReaderMgr.getCurrentReaderNum())
01987             emitError(XMLErrs::PartialMarkupInEntity);
01988 
01989         return EntityExp_Returned;
01990     }
01991 
01992     // It's a normal entity ref, so get its name
01993     XMLBufBid bbName(&fBufMgr);
01994     if (!fReaderMgr.getName(bbName.getBuffer()))
01995     {
01996         emitError(XMLErrs::ExpectedEntityRefName);
01997         return EntityExp_Failed;
01998     }
01999 
02000     //  Next char must be a semi-colon. But if it's not, just emit
02001     //  an error and try to continue.
02002     if (!fReaderMgr.skippedChar(chSemiColon))
02003         emitError(XMLErrs::UnterminatedEntityRef, bbName.getRawBuffer());
02004 
02005     // Make sure we ended up on the same entity reader as the & char
02006     if (curReader != fReaderMgr.getCurrentReaderNum())
02007         emitError(XMLErrs::PartialMarkupInEntity);
02008 
02009     // Look up the name in the general entity pool
02010     // If it does not exist, then obviously an error
02011     if (!fEntityTable->containsKey(bbName.getRawBuffer()))
02012     {
02013         // XML 1.0 Section 4.1
02014         // Well-formedness Constraint for entity not found:
02015         //   In a document without any DTD, a document with only an internal DTD subset which contains no parameter entity references,
02016         //      or a document with "standalone='yes'", for an entity reference that does not occur within the external subset
02017         //      or a parameter entity
02018         if (fStandalone || fHasNoDTD)
02019             emitError(XMLErrs::EntityNotFound, bbName.getRawBuffer());
02020 
02021         return EntityExp_Failed;
02022     }
02023 
02024     // If there's a SecurityManager, count this entity reference and
02025     // check the running count against the expansion limit
02026     if(fSecurityManager != 0 && ++fEntityExpansionCount > fEntityExpansionLimit) {
02027         XMLCh expLimStr[32];
02028         XMLString::sizeToText(fEntityExpansionLimit, expLimStr, 31, 10, fMemoryManager);
02029         emitError
02030         ( 
02031             XMLErrs::EntityExpansionLimitExceeded
02032             , expLimStr
02033         );
02034         // there seems nothing better to be done than to reset the entity expansion counter
02035         fEntityExpansionCount = 0;
02036     }
02037 
02038     firstCh = fEntityTable->get(bbName.getRawBuffer()); // the table entry is returned as a single character
02039     escaped = true;
02040     return EntityExp_Returned;
02041 }
02042 
02043 // ---------------------------------------------------------------------------
02044 //  WFXMLScanner: Grammar preparsing
02045 // ---------------------------------------------------------------------------
02046 Grammar* WFXMLScanner::loadGrammar(const   InputSource&
02047                                    , const short
02048                                    , const bool)
02049 {
02050     // All arguments are ignored and no grammar is loaded; 0 is always returned. REVISIT: emit a warning or throw an exception
02051     return 0;
02052 }
02053 
02054 
02055 XERCES_CPP_NAMESPACE_END