GME  13
XMLScanner.cpp
Go to the documentation of this file.
00001 /*
00002  * Licensed to the Apache Software Foundation (ASF) under one or more
00003  * contributor license agreements.  See the NOTICE file distributed with
00004  * this work for additional information regarding copyright ownership.
00005  * The ASF licenses this file to You under the Apache License, Version 2.0
00006  * (the "License"); you may not use this file except in compliance with
00007  * the License.  You may obtain a copy of the License at
00008  *
00009  *      http://www.apache.org/licenses/LICENSE-2.0
00010  *
00011  * Unless required by applicable law or agreed to in writing, software
00012  * distributed under the License is distributed on an "AS IS" BASIS,
00013  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00014  * See the License for the specific language governing permissions and
00015  * limitations under the License.
00016  */
00017 
00018 /*
00019  * $Id: XMLScanner.cpp 882548 2009-11-20 13:44:14Z borisk $
00020  */
00021 
00022 
00023 // ---------------------------------------------------------------------------
00024 //  Includes
00025 // ---------------------------------------------------------------------------
00026 #include <xercesc/internal/XMLScanner.hpp>
00027 #include <xercesc/internal/ValidationContextImpl.hpp>
00028 #include <xercesc/util/Janitor.hpp>
00029 #include <xercesc/util/Mutexes.hpp>
00030 #include <xercesc/util/RuntimeException.hpp>
00031 #include <xercesc/util/UnexpectedEOFException.hpp>
00032 #include <xercesc/util/XMLMsgLoader.hpp>
00033 #include <xercesc/util/XMLInitializer.hpp>
00034 #include <xercesc/framework/LocalFileInputSource.hpp>
00035 #include <xercesc/framework/URLInputSource.hpp>
00036 #include <xercesc/framework/XMLDocumentHandler.hpp>
00037 #include <xercesc/framework/XMLEntityHandler.hpp>
00038 #include <xercesc/framework/XMLPScanToken.hpp>
00039 #include <xercesc/framework/XMLValidator.hpp>
00040 #include <xercesc/internal/EndOfEntityException.hpp>
00041 #include <xercesc/validators/DTD/DocTypeHandler.hpp>
00042 #include <xercesc/validators/common/GrammarResolver.hpp>
00043 #include <xercesc/util/OutOfMemoryException.hpp>
00044 #include <xercesc/util/XMLResourceIdentifier.hpp>
00045 
00046 XERCES_CPP_NAMESPACE_BEGIN
00047 
00048 // ---------------------------------------------------------------------------
00049 //  Local static data
00050 // ---------------------------------------------------------------------------
00051 static XMLUInt32       gScannerId = 0;      // Last scanner id handed out; bumped under sScannerMutex in commonInit()
00052 static XMLMutex*       sScannerMutex = 0;   // Created in initializeXMLScanner(), deleted in terminateXMLScanner()
00053 static XMLMsgLoader*   gMsgLoader = 0;      // Loader for the XMLUni::fgXMLErrDomain message set, used by emitError()
00054 
00055 void XMLInitializer::initializeXMLScanner()
00056 {
00057     gMsgLoader = XMLPlatformUtils::loadMsgSet(XMLUni::fgXMLErrDomain);
00058 
00059     if (!gMsgLoader)
00060       XMLPlatformUtils::panic(PanicHandler::Panic_CantLoadMsgDomain);
00061 
00062     sScannerMutex = new XMLMutex(XMLPlatformUtils::fgMemoryManager);
00063 }
00064 
00065 void XMLInitializer::terminateXMLScanner()
00066 {
00067     delete gMsgLoader;
00068     gMsgLoader = 0;
00069 
00070     delete sScannerMutex;
00071     sScannerMutex = 0;
00072 }
00073 
00074 //  Janitor types used below to call XMLScanner::cleanUp or ReaderMgr::reset
00075 //  on scope exit; call sites invoke release() to cancel the pending call.
00076 typedef JanitorMemFunCall<XMLScanner>   CleanupType;
00077 typedef JanitorMemFunCall<ReaderMgr>    ReaderMgrResetType;
00078 
00079 
00080 // ---------------------------------------------------------------------------
00081 //  XMLScanner: Constructors and Destructor
00082 // ---------------------------------------------------------------------------
00083 XMLScanner::XMLScanner(XMLValidator* const valToAdopt,
00084                        GrammarResolver* const grammarResolver,
00085                        MemoryManager* const manager)
00086     : fBufferSize(1024 * 1024)
00087     , fLowWaterMark (100)
00088     , fStandardUriConformant(false)
00089     , fCalculateSrcOfs(false)
00090     , fDoNamespaces(false)
00091     , fExitOnFirstFatal(true)
00092     , fValidationConstraintFatal(false)
00093     , fInException(false)
00094     , fStandalone(false)
00095     , fHasNoDTD(true)
00096     , fValidate(false)
00097     , fValidatorFromUser(false)
00098     , fDoSchema(false)
00099     , fSchemaFullChecking(false)
00100     , fIdentityConstraintChecking(true)
00101     , fToCacheGrammar(false)
00102     , fUseCachedGrammar(false)
00103     , fLoadExternalDTD(true)
00104     , fLoadSchema(true)
00105     , fNormalizeData(true)
00106     , fGenerateSyntheticAnnotations(false)
00107     , fValidateAnnotations(false)
00108     , fIgnoreCachedDTD(false)
00109     , fIgnoreAnnotations(false)
00110     , fDisableDefaultEntityResolution(false)
00111     , fSkipDTDValidation(false)
00112     , fHandleMultipleImports(false)
00113     , fErrorCount(0)
00114     , fEntityExpansionLimit(0)
00115     , fEntityExpansionCount(0)
00116     , fEmptyNamespaceId(0)
00117     , fUnknownNamespaceId(0)
00118     , fXMLNamespaceId(0)
00119     , fXMLNSNamespaceId(0)
00120     , fSchemaNamespaceId(0)
00121     , fUIntPool(0)
00122     , fUIntPoolRow(0)
00123     , fUIntPoolCol(0)
00124     , fUIntPoolRowTotal(2)
00125     , fScannerId(0)
00126     , fSequenceId(0)
00127     , fAttrList(0)
00128     , fAttrDupChkRegistry(0)
00129     , fDocHandler(0)
00130     , fDocTypeHandler(0)
00131     , fEntityHandler(0)
00132     , fErrorReporter(0)
00133     , fErrorHandler(0)
00134     , fPSVIHandler(0)
00135     , fValidationContext(0)
00136     , fEntityDeclPoolRetrieved(false)
00137     , fReaderMgr(manager)
00138     , fValidator(valToAdopt)
00139     , fValScheme(Val_Never)
00140     , fGrammarResolver(grammarResolver)
00141     , fGrammarPoolMemoryManager(grammarResolver->getGrammarPoolMemoryManager())
00142     , fGrammar(0)
00143     , fRootGrammar(0)
00144     , fURIStringPool(0)
00145     , fRootElemName(0)
00146     , fExternalSchemaLocation(0)
00147     , fExternalNoNamespaceSchemaLocation(0)
00148     , fSecurityManager(0)
00149     , fXMLVersion(XMLReader::XMLV1_0)
00150     , fMemoryManager(manager)
00151     , fBufMgr(manager)
00152     , fAttNameBuf(1023, manager)
00153     , fAttValueBuf(1023, manager)
00154     , fCDataBuf(1023, manager)
00155     , fQNameBuf(1023, manager)
00156     , fPrefixBuf(1023, manager)
00157     , fURIBuf(1023, manager)
00158     , fWSNormalizeBuf(1023, manager)
00159     , fElemStack(manager)
00160 {
00161     CleanupType cleanup(this, &XMLScanner::cleanUp);
00162 
00163     try
00164     {
00165         commonInit();
00166     }
00167     catch(const OutOfMemoryException&)
00168     {
00169         // Don't cleanup when out of memory, since executing the
00170         // code can cause problems.
00171         cleanup.release();
00172 
00173         throw;
00174     }
00175 
00176     cleanup.release();
00177 }
00178 
00179 XMLScanner::XMLScanner( XMLDocumentHandler* const  docHandler
00180                           , DocTypeHandler* const    docTypeHandler
00181                           , XMLEntityHandler* const  entityHandler
00182                           , XMLErrorReporter* const  errHandler
00183                           , XMLValidator* const      valToAdopt
00184                           , GrammarResolver* const   grammarResolver
00185                           , MemoryManager* const     manager)
00186 
00187     : fBufferSize(1024 * 1024)
00188     , fLowWaterMark (100)
00189     , fStandardUriConformant(false)
00190     , fCalculateSrcOfs(false)
00191     , fDoNamespaces(false)
00192     , fExitOnFirstFatal(true)
00193     , fValidationConstraintFatal(false)
00194     , fInException(false)
00195     , fStandalone(false)
00196     , fHasNoDTD(true)
00197     , fValidate(false)
00198     , fValidatorFromUser(false)
00199     , fDoSchema(false)
00200     , fSchemaFullChecking(false)
00201     , fIdentityConstraintChecking(true)
00202     , fToCacheGrammar(false)
00203     , fUseCachedGrammar(false)
00204         , fLoadExternalDTD(true)
00205     , fLoadSchema(true)
00206     , fNormalizeData(true)
00207     , fGenerateSyntheticAnnotations(false)
00208     , fValidateAnnotations(false)
00209     , fIgnoreCachedDTD(false)
00210     , fIgnoreAnnotations(false)
00211     , fDisableDefaultEntityResolution(false)
00212     , fSkipDTDValidation(false)
00213     , fHandleMultipleImports(false)
00214     , fErrorCount(0)
00215     , fEntityExpansionLimit(0)
00216     , fEntityExpansionCount(0)
00217     , fEmptyNamespaceId(0)
00218     , fUnknownNamespaceId(0)
00219     , fXMLNamespaceId(0)
00220     , fXMLNSNamespaceId(0)
00221     , fSchemaNamespaceId(0)
00222     , fUIntPool(0)
00223     , fUIntPoolRow(0)
00224     , fUIntPoolCol(0)
00225     , fUIntPoolRowTotal(2)
00226     , fScannerId(0)
00227     , fSequenceId(0)
00228     , fAttrList(0)
00229     , fAttrDupChkRegistry(0)
00230     , fDocHandler(docHandler)
00231     , fDocTypeHandler(docTypeHandler)
00232     , fEntityHandler(entityHandler)
00233     , fErrorReporter(errHandler)
00234     , fErrorHandler(0)
00235     , fPSVIHandler(0)
00236     , fValidationContext(0)
00237     , fEntityDeclPoolRetrieved(false)
00238     , fReaderMgr(manager)
00239     , fValidator(valToAdopt)
00240     , fValScheme(Val_Never)
00241     , fGrammarResolver(grammarResolver)
00242     , fGrammarPoolMemoryManager(grammarResolver->getGrammarPoolMemoryManager())
00243     , fGrammar(0)
00244     , fRootGrammar(0)
00245     , fURIStringPool(0)
00246     , fRootElemName(0)
00247     , fExternalSchemaLocation(0)
00248     , fExternalNoNamespaceSchemaLocation(0)
00249     , fSecurityManager(0)
00250     , fXMLVersion(XMLReader::XMLV1_0)
00251     , fMemoryManager(manager)
00252     , fBufMgr(manager)
00253     , fAttNameBuf(1023, manager)
00254     , fAttValueBuf(1023, manager)
00255     , fCDataBuf(1023, manager)
00256     , fQNameBuf(1023, manager)
00257     , fPrefixBuf(1023, manager)
00258     , fURIBuf(1023, manager)
00259     , fWSNormalizeBuf(1023, manager)
00260     , fElemStack(manager)
00261 {
00262     CleanupType cleanup(this, &XMLScanner::cleanUp);
00263 
00264     try
00265     {
00266         commonInit();
00267     }
00268     catch(const OutOfMemoryException&)
00269     {
00270         // Don't cleanup when out of memory, since executing the
00271         // code can cause problems.
00272         cleanup.release();
00273 
00274         throw;
00275     }
00276 
00277     cleanup.release();
00278 }
00279 
00280 XMLScanner::~XMLScanner()
00281 {
00282     cleanUp();
00283 }
00284 
00285 void XMLScanner::resetCachedGrammar ()
00286 {
00287 }
00288 
00289 void XMLScanner::setValidator(XMLValidator* const valToAdopt)
00290 {
00291     if (fValidatorFromUser)
00292         delete fValidator;
00293     fValidator = valToAdopt;
00294     fValidatorFromUser = true;
00295     initValidator(fValidator);
00296 }
00297 
00298 
00299 
00300 // ---------------------------------------------------------------------------
00301 //  XMLScanner: Main entry point to scan a document
00302 // ---------------------------------------------------------------------------
00303 void XMLScanner::scanDocument(  const   XMLCh* const    systemId)
00304 {
00305     //  First we try to parse it as a URL. If that fails, we assume its
00306     //  a file and try it that way.
00307     InputSource* srcToUse = 0;
00308     try
00309     {
00310         //  Create a temporary URL. Since this is the primary document,
00311         //  it has to be fully qualified. If not, then assume we are just
00312         //  mistaking a file for a URL.
00313         XMLURL tmpURL(fMemoryManager);
00314 
00315         if (XMLURL::parse(systemId, tmpURL)) {
00316 
00317             if (tmpURL.isRelative()) {
00318                 if (!fStandardUriConformant)
00319                     srcToUse = new (fMemoryManager) LocalFileInputSource(systemId, fMemoryManager);
00320                 else {
00321                     // since this is the top of the try/catch, cannot call ThrowXMLwithMemMgr
00322                     // emit the error directly
00323                     MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_NoProtocolPresent, fMemoryManager);
00324                     fInException = true;
00325                     emitError
00326                     (
00327                         XMLErrs::XMLException_Fatal
00328                         , e.getCode()
00329                         , e.getMessage()
00330                     );
00331                     return;
00332                 }
00333             }
00334             else
00335             {
00336                 if (fStandardUriConformant && tmpURL.hasInvalidChar()) {
00337                     MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_MalformedURL, fMemoryManager);
00338                     fInException = true;
00339                     emitError
00340                     (
00341                         XMLErrs::XMLException_Fatal
00342                         , e.getCode()
00343                         , e.getMessage()
00344                     );
00345                     return;
00346                 }
00347                 srcToUse = new (fMemoryManager) URLInputSource(tmpURL, fMemoryManager);
00348             }
00349         }
00350         else {
00351 
00352             if (!fStandardUriConformant)
00353                 srcToUse = new (fMemoryManager) LocalFileInputSource(systemId, fMemoryManager);
00354             else {
00355                 // since this is the top of the try/catch, cannot call ThrowXMLwithMemMgr
00356                 // emit the error directly
00357                 // lazy bypass ... since all MalformedURLException are fatal, no need to check the type
00358                 MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_MalformedURL, fMemoryManager);
00359                 fInException = true;
00360                 emitError
00361                 (
00362                     XMLErrs::XMLException_Fatal
00363                     , e.getCode()
00364                     , e.getMessage()
00365                 );
00366                 return;
00367             }
00368         }
00369     }
00370     catch(const XMLException& excToCatch)
00371     {
00372         //  For any other XMLException,
00373         //  emit the error and catch any user exception thrown from here.
00374         fInException = true;
00375         if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning)
00376             emitError
00377             (
00378                 XMLErrs::XMLException_Warning
00379                 , excToCatch.getCode()
00380                 , excToCatch.getMessage()
00381             );
00382         else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal)
00383             emitError
00384             (
00385                 XMLErrs::XMLException_Fatal
00386                 , excToCatch.getCode()
00387                 , excToCatch.getMessage()
00388             );
00389         else
00390             emitError
00391             (
00392                 XMLErrs::XMLException_Error
00393                 , excToCatch.getCode()
00394                 , excToCatch.getMessage()
00395             );
00396         return;
00397     }
00398 
00399     Janitor<InputSource> janSrc(srcToUse);
00400     scanDocument(*srcToUse);
00401 }
00402 
00403 void XMLScanner::scanDocument(  const   char* const systemId)
00404 {
00405     // We just delegate this to the XMLCh version after transcoding
00406     XMLCh* tmpBuf = XMLString::transcode(systemId, fMemoryManager);
00407     ArrayJanitor<XMLCh> janBuf(tmpBuf, fMemoryManager);
00408     scanDocument(tmpBuf);
00409 }
00410 
00411 
00412 //  This method begins a progressive parse. It scans through the prolog and
00413 //  returns a token to be used on subsequent scanNext() calls. If the return
00414 //  value is true, then the token is legal and ready for further use. If it
00415 //  returns false, then the scan of the prolog failed and the token is not
00416 //  going to work on subsequent scanNext() calls.
00417 bool XMLScanner::scanFirst( const   XMLCh* const    systemId
00418                             ,       XMLPScanToken&  toFill)
00419 {
00420     //  First we try to parse it as a URL. If that fails, we assume its
00421     //  a file and try it that way.
00422     InputSource* srcToUse = 0;
00423     try
00424     {
00425         //  Create a temporary URL. Since this is the primary document,
00426         //  it has to be fully qualified. If not, then assume we are just
00427         //  mistaking a file for a URL.
00428         XMLURL tmpURL(fMemoryManager);
00429         if (XMLURL::parse(systemId, tmpURL)) {
00430             if (tmpURL.isRelative()) {
00431                 if (!fStandardUriConformant)
00432                     srcToUse = new (fMemoryManager) LocalFileInputSource(systemId, fMemoryManager);
00433                 else {
00434                     // since this is the top of the try/catch, cannot call ThrowXMLwithMemMgr
00435                     // emit the error directly
00436                     MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_NoProtocolPresent, fMemoryManager);
00437                     fInException = true;
00438                     emitError
00439                     (
00440                         XMLErrs::XMLException_Fatal
00441                         , e.getCode()
00442                         , e.getMessage()
00443                     );
00444                     return false;
00445                 }
00446             }
00447             else
00448             {
00449                 if (fStandardUriConformant && tmpURL.hasInvalidChar()) {
00450                     MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_MalformedURL, fMemoryManager);
00451                     fInException = true;
00452                     emitError
00453                     (
00454                         XMLErrs::XMLException_Fatal
00455                         , e.getCode()
00456                         , e.getMessage()
00457                     );
00458                     return false;
00459                 }
00460                 srcToUse = new (fMemoryManager) URLInputSource(tmpURL, fMemoryManager);
00461             }
00462         }
00463         else {
00464             if (!fStandardUriConformant)
00465                 srcToUse = new (fMemoryManager) LocalFileInputSource(systemId,  fMemoryManager);
00466             else {
00467                 // since this is the top of the try/catch, cannot call ThrowXMLwithMemMgr
00468                 // emit the error directly
00469                 // lazy bypass ... since all MalformedURLException are fatal, no need to check the type
00470                 MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_MalformedURL);
00471                 fInException = true;
00472                 emitError
00473                 (
00474                     XMLErrs::XMLException_Fatal
00475                     , e.getCode()
00476                     , e.getMessage()
00477                 );
00478                 return false;
00479             }
00480         }
00481     }
00482     catch(const XMLException& excToCatch)
00483     {
00484         //  For any other XMLException,
00485         //  emit the error and catch any user exception thrown from here.
00486         fInException = true;
00487         if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning)
00488             emitError
00489             (
00490                 XMLErrs::XMLException_Warning
00491                 , excToCatch.getCode()
00492                 , excToCatch.getMessage()
00493             );
00494         else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal)
00495             emitError
00496             (
00497                 XMLErrs::XMLException_Fatal
00498                 , excToCatch.getCode()
00499                 , excToCatch.getMessage()
00500             );
00501         else
00502             emitError
00503             (
00504                 XMLErrs::XMLException_Error
00505                 , excToCatch.getCode()
00506                 , excToCatch.getMessage()
00507             );
00508         return false;
00509     }
00510 
00511     Janitor<InputSource> janSrc(srcToUse);
00512     return scanFirst(*srcToUse, toFill);
00513 }
00514 
00515 bool XMLScanner::scanFirst( const   char* const     systemId
00516                             ,       XMLPScanToken&  toFill)
00517 {
00518     // We just delegate this to the XMLCh version after transcoding
00519     XMLCh* tmpBuf = XMLString::transcode(systemId, fMemoryManager);
00520     ArrayJanitor<XMLCh> janBuf(tmpBuf, fMemoryManager);
00521     return scanFirst(tmpBuf, toFill);
00522 }
00523 
00524 bool XMLScanner::scanFirst( const   InputSource&    src
00525                            ,       XMLPScanToken&  toFill)
00526 {
00527     //  Bump up the sequence id for this new scan cycle. This will invalidate
00528     //  any previous tokens we've returned.
00529     fSequenceId++;
00530 
00531     ReaderMgrResetType  resetReaderMgr(&fReaderMgr, &ReaderMgr::reset);
00532 
00533    // Reset the scanner and its plugged in stuff for a new run.  This
00534     // resets all the data structures, creates the initial reader and
00535     // pushes it on the stack, and sets up the base document path
00536     scanReset(src);
00537 
00538     // If we have a document handler, then call the start document
00539     if (fDocHandler)
00540         fDocHandler->startDocument();
00541 
00542     try
00543     {
00544         //  Scan the prolog part, which is everything before the root element
00545         //  including the DTD subsets. This is all that is done on the scan
00546         //  first.
00547         scanProlog();
00548 
00549         //  If we got to the end of input, then its not a valid XML file.
00550         //  Else, go on to scan the content.
00551         if (fReaderMgr.atEOF())
00552         {
00553             emitError(XMLErrs::EmptyMainEntity);
00554         }
00555     }
00556     //  NOTE:
00557     //
00558     //  In all of the error processing below, the emitError() call MUST come
00559     //  before the flush of the reader mgr, or it will fail because it tries
00560     //  to find out the position in the XML source of the error.
00561     catch(const XMLErrs::Codes)
00562     {
00563         // This is a 'first failure' exception so return failure
00564         return false;
00565     }
00566     catch(const XMLValid::Codes)
00567     {
00568         // This is a 'first fatal error' type exit, return failure
00569         return false;
00570     }
00571     catch(const XMLException& excToCatch)
00572     {
00573         //  Emit the error and catch any user exception thrown from here. Make
00574         //  sure in all cases we flush the reader manager.
00575         fInException = true;
00576         try
00577         {
00578             if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning)
00579                 emitError
00580                 (
00581                     XMLErrs::XMLException_Warning
00582                     , excToCatch.getCode()
00583                     , excToCatch.getMessage()
00584                 );
00585             else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal)
00586                 emitError
00587                 (
00588                     XMLErrs::XMLException_Fatal
00589                     , excToCatch.getCode()
00590                     , excToCatch.getMessage()
00591                 );
00592             else
00593                 emitError
00594                 (
00595                     XMLErrs::XMLException_Error
00596                     , excToCatch.getCode()
00597                     , excToCatch.getMessage()
00598                 );
00599         }
00600         catch(const OutOfMemoryException&)
00601         {
00602             // This is a special case for out-of-memory
00603             // conditions, because resetting the ReaderMgr
00604             // can be problematic.
00605             resetReaderMgr.release();
00606 
00607             throw;
00608         }
00609 
00610         return false;
00611     }
00612     catch(const OutOfMemoryException&)
00613     {
00614         // This is a special case for out-of-memory
00615         // conditions, because resetting the ReaderMgr
00616         // can be problematic.
00617         resetReaderMgr.release();
00618 
00619         throw;
00620     }
00621 
00622     // Fill in the caller's token to make it legal and return success
00623     toFill.set(fScannerId, fSequenceId);
00624 
00625     // Release the object that will reset the ReaderMgr, since there's
00626     // more to scan.
00627     resetReaderMgr.release();
00628 
00629     return true;
00630 }
00631 
00632 
00633 void XMLScanner::scanReset(XMLPScanToken& token)
00634 {
00635     // Make sure this token is still legal
00636     if (!isLegalToken(token))
00637         ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Scan_BadPScanToken, fMemoryManager);
00638 
00639     // Reset the reader manager
00640     fReaderMgr.reset();
00641 
00642     // And invalidate any tokens by bumping our sequence number
00643     fSequenceId++;
00644 
00645     // Reset our error count
00646     fErrorCount = 0;
00647 }
00648 
00649 void XMLScanner::setParseSettings(XMLScanner* const refScanner)
00650 {
00651     setDocHandler(refScanner->getDocHandler());
00652     setDocTypeHandler(refScanner->getDocTypeHandler());
00653     setErrorHandler(refScanner->getErrorHandler());
00654     setErrorReporter(refScanner->getErrorReporter());
00655     setEntityHandler(refScanner->getEntityHandler());
00656     setDoNamespaces(refScanner->getDoNamespaces());
00657     setDoSchema(refScanner->getDoSchema());
00658     setCalculateSrcOfs(refScanner->getCalculateSrcOfs());
00659     setStandardUriConformant(refScanner->getStandardUriConformant());
00660     setExitOnFirstFatal(refScanner->getExitOnFirstFatal());
00661     setValidationConstraintFatal(refScanner->getValidationConstraintFatal());
00662     setIdentityConstraintChecking(refScanner->getIdentityConstraintChecking());
00663     setValidationSchemaFullChecking(refScanner->getValidationSchemaFullChecking());
00664     cacheGrammarFromParse(refScanner->isCachingGrammarFromParse());
00665     useCachedGrammarInParse(refScanner->isUsingCachedGrammarInParse());
00666     setLoadExternalDTD(refScanner->getLoadExternalDTD());
00667     setLoadSchema(refScanner->getLoadSchema());
00668     setNormalizeData(refScanner->getNormalizeData());
00669     setExternalSchemaLocation(refScanner->getExternalSchemaLocation());
00670     setExternalNoNamespaceSchemaLocation(refScanner->getExternalNoNamespaceSchemaLocation());
00671     setValidationScheme(refScanner->getValidationScheme());
00672     setSecurityManager(refScanner->getSecurityManager());
00673     setPSVIHandler(refScanner->getPSVIHandler());
00674 }
00675 
00676 // ---------------------------------------------------------------------------
00677 //  XMLScanner: Private helper methods.
00678 // ---------------------------------------------------------------------------
00679 
00680 //  This method handles the common initialization, to avoid having to do
00681 //  it redundantly in multiple constructors.
00682 void XMLScanner::commonInit()
00683 {
00684     //  We have to do a little init that involves statics, so we have to
00685     //  use the mutex to protect it.
00686     {
00687         XMLMutexLock lockInit(sScannerMutex);
00688 
00689         // And assign ourselves the next available scanner id
00690         fScannerId = ++gScannerId;
00691     }
00692 
00693     //  Create the attribute list, which is used to store attribute values
00694     //  during start tag processing. Give it a reasonable initial size that
00695     //  will serve for most folks, though it will grow as required.
00696     fAttrList = new (fMemoryManager) RefVectorOf<XMLAttr>(32, true, fMemoryManager);
00697 
00698     //  Create the id ref list. This is used to enforce XML 1.0 ID ref
00699     //  semantics, i.e. all id refs must refer to elements that exist
00700     fValidationContext = new (fMemoryManager) ValidationContextImpl(fMemoryManager);
00701     fValidationContext->setElemStack(&fElemStack);
00702     fValidationContext->setScanner(this);
00703 
00704     //  Create the GrammarResolver
00705     //fGrammarResolver = new GrammarResolver();
00706 
00707     // create initial, 64-element, fUIntPool
00708     fUIntPool = (unsigned int **)fMemoryManager->allocate(sizeof(unsigned int *) *fUIntPoolRowTotal);
00709     memset(fUIntPool, 0, sizeof(unsigned int *) * fUIntPoolRowTotal);
00710     fUIntPool[0] = (unsigned int *)fMemoryManager->allocate(sizeof(unsigned int) << 6);
00711     memset(fUIntPool[0], 0, sizeof(unsigned int) << 6);
00712 
00713     // Register self as handler for XMLBufferFull events on the CDATA buffer
00714     fCDataBuf.setFullHandler(this, fBufferSize);
00715 
00716    if (fValidator) {
00717        fValidatorFromUser = true;
00718        initValidator(fValidator);
00719    }
00720 }
00721 
00722 void XMLScanner::cleanUp()
00723 {
00724     delete fAttrList;
00725     delete fAttrDupChkRegistry;
00726     delete fValidationContext;
00727     fMemoryManager->deallocate(fRootElemName);//delete [] fRootElemName;
00728     fMemoryManager->deallocate(fExternalSchemaLocation);//delete [] fExternalSchemaLocation;
00729     fMemoryManager->deallocate(fExternalNoNamespaceSchemaLocation);//delete [] fExternalNoNamespaceSchemaLocation;
00730     // delete fUIntPool
00731     if (fUIntPool)
00732     {
00733         for (unsigned int i=0; i<=fUIntPoolRow; i++)
00734         {
00735             fMemoryManager->deallocate(fUIntPool[i]);
00736         }
00737         fMemoryManager->deallocate(fUIntPool);
00738     }
00739 }
00740 
00741 void XMLScanner::initValidator(XMLValidator* theValidator) {
00742 
00743     //  Tell the validator about the stuff it needs to know in order to
00744     //  do its work.
00745     theValidator->setScannerInfo(this, &fReaderMgr, &fBufMgr);
00746     theValidator->setErrorReporter(fErrorReporter);
00747 }
00748 
00749 // ---------------------------------------------------------------------------
00750 //  XMLScanner: Error emitting methods
00751 // ---------------------------------------------------------------------------
00752 
00753 //  These methods are called whenever the scanner wants to emit an error.
00754 //  They handle getting the message loaded, doing token replacement, etc.,
00755 //  and then call the error reporter, if one is installed.
00756 bool XMLScanner::emitErrorWillThrowException(const XMLErrs::Codes toEmit)
00757 {
00758     if (XMLErrs::isFatal(toEmit) && fExitOnFirstFatal && !fInException)
00759         return true;
00760     return false;
00761 }
00762 
00763 void XMLScanner::emitError(const XMLErrs::Codes toEmit)
00764 {
00765     // Bump the error count if it is not a warning
00766     if (XMLErrs::errorType(toEmit) != XMLErrorReporter::ErrType_Warning)
00767         incrementErrorCount();
00768 
00769     if (fErrorReporter)
00770     {
00771         // Load the message into a local for display
00772         const XMLSize_t msgSize = 1023;
00773         XMLCh errText[msgSize + 1];
00774 
00775         if (!gMsgLoader->loadMsg(toEmit, errText, msgSize))
00776         {
00777                 // <TBD> Probably should load a default msg here
00778         }
00779 
00780         //  Create a LastExtEntityInfo structure and get the reader manager
00781         //  to fill it in for us. This will give us the information about
00782         //  the last reader on the stack that was an external entity of some
00783         //  sort (i.e. it will ignore internal entities.
00784         ReaderMgr::LastExtEntityInfo lastInfo;
00785         fReaderMgr.getLastExtEntityInfo(lastInfo);
00786 
00787         fErrorReporter->error
00788         (
00789             toEmit
00790             , XMLUni::fgXMLErrDomain
00791             , XMLErrs::errorType(toEmit)
00792             , errText
00793             , lastInfo.systemId
00794             , lastInfo.publicId
00795             , lastInfo.lineNumber
00796             , lastInfo.colNumber
00797         );
00798     }
00799 
00800     // Bail out if its fatal an we are to give up on the first fatal error
00801     if (emitErrorWillThrowException(toEmit))
00802         throw toEmit;
00803 }
00804 
00805 void XMLScanner::emitError( const   XMLErrs::Codes    toEmit
00806                             , const XMLCh* const        text1
00807                             , const XMLCh* const        text2
00808                             , const XMLCh* const        text3
00809                             , const XMLCh* const        text4)
00810 {
00811     // Bump the error count if it is not a warning
00812     if (XMLErrs::errorType(toEmit) != XMLErrorReporter::ErrType_Warning)
00813         incrementErrorCount();
00814 
00815     if (fErrorReporter)
00816     {
00817         //  Load the message into alocal and replace any tokens found in
00818         //  the text.
00819         const XMLSize_t maxChars = 2047;
00820         XMLCh errText[maxChars + 1];
00821 
00822         if (!gMsgLoader->loadMsg(toEmit, errText, maxChars, text1, text2, text3, text4, fMemoryManager))
00823         {
00824                 // <TBD> Should probably load a default message here
00825         }
00826 
00827         //  Create a LastExtEntityInfo structure and get the reader manager
00828         //  to fill it in for us. This will give us the information about
00829         //  the last reader on the stack that was an external entity of some
00830         //  sort (i.e. it will ignore internal entities.
00831         ReaderMgr::LastExtEntityInfo lastInfo;
00832         fReaderMgr.getLastExtEntityInfo(lastInfo);
00833 
00834         fErrorReporter->error
00835         (
00836             toEmit
00837             , XMLUni::fgXMLErrDomain
00838             , XMLErrs::errorType(toEmit)
00839             , errText
00840             , lastInfo.systemId
00841             , lastInfo.publicId
00842             , lastInfo.lineNumber
00843             , lastInfo.colNumber
00844         );
00845     }
00846 
00847     // Bail out if its fatal an we are to give up on the first fatal error
00848     if (emitErrorWillThrowException(toEmit))
00849         throw toEmit;
00850 }
00851 
00852 void XMLScanner::emitError( const   XMLErrs::Codes    toEmit
00853                             , const char* const         text1
00854                             , const char* const         text2
00855                             , const char* const         text3
00856                             , const char* const         text4)
00857 {
00858     // Bump the error count if it is not a warning
00859     if (XMLErrs::errorType(toEmit) != XMLErrorReporter::ErrType_Warning)
00860         incrementErrorCount();
00861 
00862     if (fErrorReporter)
00863     {
00864         //  Load the message into alocal and replace any tokens found in
00865         //  the text.
00866         const XMLSize_t maxChars = 2047;
00867         XMLCh errText[maxChars + 1];
00868 
00869         if (!gMsgLoader->loadMsg(toEmit, errText, maxChars, text1, text2, text3, text4, fMemoryManager))
00870         {
00871                 // <TBD> Should probably load a default message here
00872         }
00873 
00874         //  Create a LastExtEntityInfo structure and get the reader manager
00875         //  to fill it in for us. This will give us the information about
00876         //  the last reader on the stack that was an external entity of some
00877         //  sort (i.e. it will ignore internal entities.
00878         ReaderMgr::LastExtEntityInfo lastInfo;
00879         fReaderMgr.getLastExtEntityInfo(lastInfo);
00880 
00881         fErrorReporter->error
00882         (
00883             toEmit
00884             , XMLUni::fgXMLErrDomain
00885             , XMLErrs::errorType(toEmit)
00886             , errText
00887             , lastInfo.systemId
00888             , lastInfo.publicId
00889             , lastInfo.lineNumber
00890             , lastInfo.colNumber
00891         );
00892     }
00893 
00894     // Bail out if its fatal an we are to give up on the first fatal error
00895     if (emitErrorWillThrowException(toEmit))
00896         throw toEmit;
00897 }
00898 
00899 void XMLScanner::emitError( const   XMLErrs::Codes      toEmit
00900                             , const XMLExcepts::Codes   originalExceptCode
00901                             , const XMLCh* const        text1
00902                             , const XMLCh* const        text2
00903                             , const XMLCh* const        text3
00904                             , const XMLCh* const        text4)
00905 {
00906     // Bump the error count if it is not a warning
00907     if (XMLErrs::errorType(toEmit) != XMLErrorReporter::ErrType_Warning)
00908         incrementErrorCount();
00909 
00910     if (fErrorReporter)
00911     {
00912         //  Load the message into alocal and replace any tokens found in
00913         //  the text.
00914         const XMLSize_t maxChars = 2047;
00915         XMLCh errText[maxChars + 1];
00916 
00917         if (!gMsgLoader->loadMsg(toEmit, errText, maxChars, text1, text2, text3, text4, fMemoryManager))
00918         {
00919                 // <TBD> Should probably load a default message here
00920         }
00921 
00922         //  Create a LastExtEntityInfo structure and get the reader manager
00923         //  to fill it in for us. This will give us the information about
00924         //  the last reader on the stack that was an external entity of some
00925         //  sort (i.e. it will ignore internal entities.
00926         ReaderMgr::LastExtEntityInfo lastInfo;
00927         fReaderMgr.getLastExtEntityInfo(lastInfo);
00928 
00929         fErrorReporter->error
00930         (
00931             originalExceptCode
00932             , XMLUni::fgExceptDomain    //fgXMLErrDomain
00933             , XMLErrs::errorType(toEmit)
00934             , errText
00935             , lastInfo.systemId
00936             , lastInfo.publicId
00937             , lastInfo.lineNumber
00938             , lastInfo.colNumber
00939         );
00940     }
00941 
00942     // Bail out if its fatal an we are to give up on the first fatal error
00943     if (emitErrorWillThrowException(toEmit))
00944         throw toEmit;
00945 }
00946 
00947 // ---------------------------------------------------------------------------
00948 //  XMLScanner: Getter methods
00949 // ---------------------------------------------------------------------------
00950 
00951 //  This method allows the caller to query the current location of the scanner.
00952 //  It will return the sys/public ids of the current entity, and the line/col
00953 //  position within it.
00954 //
00955 //  NOTE: This API returns the location with the last external file. So if its
00956 //  currently scanning an entity, the position returned will be the end of
00957 //  the entity reference in the file that had the reference.
00958 //
00959 /*bool
00960 XMLScanner::getLastExtLocation(         XMLCh* const    sysIdToFill
00961                                 , const unsigned int    maxSysIdChars
00962                                 ,       XMLCh* const    pubIdToFill
00963                                 , const unsigned int    maxPubIdChars
00964                                 ,       XMLSSize_t&     lineToFill
00965                                 ,       XMLSSize_t&     colToFill) const
00966 {
00967     // Create a local info object and get it filled in by the reader manager
00968     ReaderMgr::LastExtEntityInfo lastInfo;
00969     fReaderMgr.getLastExtEntityInfo(lastInfo);
00970 
00971     // Fill in the line and column number
00972     lineToFill = lastInfo.lineNumber;
00973     colToFill = lastInfo.colNumber;
00974 
00975     // And copy over as much of the ids as will fit
00976     sysIdToFill[0] = 0;
00977     if (lastInfo.systemId)
00978     {
00979         if (XMLString::stringLen(lastInfo.systemId) > maxSysIdChars)
00980             return false;
00981         XMLString::copyString(sysIdToFill, lastInfo.systemId);
00982     }
00983 
00984     pubIdToFill[0] = 0;
00985     if (lastInfo.publicId)
00986     {
00987         if (XMLString::stringLen(lastInfo.publicId) > maxPubIdChars)
00988             return false;
00989         XMLString::copyString(pubIdToFill, lastInfo.publicId);
00990     }
00991     return true;
00992 }*/
00993 
00994 
00995 // ---------------------------------------------------------------------------
00996 //  XMLScanner: Private scanning methods
00997 // ---------------------------------------------------------------------------
00998 
00999 //  This method is called after the end of the root element, to handle
01000 //  any miscellaneous stuff hanging around.
01001 void XMLScanner::scanMiscellaneous()
01002 {
01003     // Get a buffer for this work
01004     XMLBufBid bbCData(&fBufMgr);
01005 
01006     while (true)
01007     {
01008         try
01009         {
01010             const XMLCh nextCh = fReaderMgr.peekNextChar();
01011 
01012             // Watch for end of file and break out
01013             if (!nextCh)
01014                 break;
01015 
01016             if (nextCh == chOpenAngle)
01017             {
01018                 if (checkXMLDecl(true))
01019                 {
01020                     // Can't have an XML decl here
01021                     emitError(XMLErrs::NotValidAfterContent);
01022                     fReaderMgr.skipPastChar(chCloseAngle);
01023                 }
01024                 else if (fReaderMgr.skippedString(XMLUni::fgPIString))
01025                 {
01026                     scanPI();
01027                 }
01028                  else if (fReaderMgr.skippedString(XMLUni::fgCommentString))
01029                 {
01030                     scanComment();
01031                 }
01032                 else
01033                 {
01034                     // This can't be possible, so just give up
01035                     emitError(XMLErrs::ExpectedCommentOrPI);
01036                     fReaderMgr.skipPastChar(chCloseAngle);
01037                 }
01038             }
01039             else if (fReaderMgr.getCurrentReader()->isWhitespace(nextCh))
01040             {
01041                 //  If we have a doc handler, then gather up the spaces and
01042                 //  call back. Otherwise, just skip over whitespace.
01043                 if (fDocHandler)
01044                 {
01045                     fReaderMgr.getSpaces(bbCData.getBuffer());
01046                     fDocHandler->ignorableWhitespace
01047                     (
01048                         bbCData.getRawBuffer()
01049                         , bbCData.getLen()
01050                         , false
01051                     );
01052                 }
01053                 else
01054                 {
01055                     fReaderMgr.skipPastSpaces();
01056                 }
01057             }
01058             else
01059             {
01060                 emitError(XMLErrs::ExpectedCommentOrPI);
01061                 fReaderMgr.skipPastChar(chCloseAngle);
01062             }
01063         }
01064         catch(const EndOfEntityException&)
01065         {
01066             //  Some entity leaked out of the content part of the document. Issue
01067             //  a warning and keep going.
01068             emitError(XMLErrs::EntityPropogated);
01069         }
01070     }
01071 }
01072 
01073 
01074 //  Scans a PI and calls the appropriate callbacks. At entry we have just
01075 //  scanned the <? part, and need to now start on the PI target name.
01076 void XMLScanner::scanPI()
01077 {
01078     const XMLCh* namePtr = 0;
01079     const XMLCh* targetPtr = 0;
01080 
01081     //  If there are any spaces here, then warn about it. If we aren't in
01082     //  'first error' mode, then we'll come back and can easily pick up
01083     //  again by just skipping them.
01084     if (fReaderMgr.lookingAtSpace())
01085     {
01086         emitError(XMLErrs::PINameExpected);
01087         fReaderMgr.skipPastSpaces();
01088     }
01089 
01090     // Get a buffer for the PI name and scan it in
01091     XMLBufBid bbName(&fBufMgr);
01092     if (!fReaderMgr.getName(bbName.getBuffer()))
01093     {
01094         emitError(XMLErrs::PINameExpected);
01095         fReaderMgr.skipPastChar(chCloseAngle);
01096         return;
01097     }
01098 
01099     // Point the name pointer at the raw data
01100     namePtr = bbName.getRawBuffer();
01101 
01102     // See if it is some form of 'xml' and emit a warning
01103     //if (!XMLString::compareIString(namePtr, XMLUni::fgXMLString))
01104     if (bbName.getLen() == 3 &&
01105         (((namePtr[0] == chLatin_x) || (namePtr[0] == chLatin_X)) &&
01106          ((namePtr[1] == chLatin_m) || (namePtr[1] == chLatin_M)) &&
01107          ((namePtr[2] == chLatin_l) || (namePtr[2] == chLatin_L))))
01108         emitError(XMLErrs::NoPIStartsWithXML);
01109 
01110     // If namespaces are enabled, then no colons allowed
01111     if (fDoNamespaces)
01112     {
01113         if (XMLString::indexOf(namePtr, chColon) != -1)
01114             emitError(XMLErrs::ColonNotLegalWithNS);
01115     }
01116 
01117     //  If we don't hit a space next, then the PI has no target. If we do
01118     //  then get out the target. Get a buffer for it as well
01119     XMLBufBid bbTarget(&fBufMgr);
01120     if (fReaderMgr.skippedSpace())
01121     {
01122         // Skip any leading spaces
01123         fReaderMgr.skipPastSpaces();
01124 
01125         bool gotLeadingSurrogate = false;
01126 
01127         // It does have a target, so lets move on to deal with that.
01128         while (1)
01129         {
01130             const XMLCh nextCh = fReaderMgr.getNextChar();
01131 
01132             // Watch for an end of file, which is always bad here
01133             if (!nextCh)
01134             {
01135                 emitError(XMLErrs::UnterminatedPI);
01136                 ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
01137             }
01138 
01139             // Watch for potential terminating character
01140             if (nextCh == chQuestion)
01141             {
01142                 // It must be followed by '>' to be a termination of the target
01143                 if (fReaderMgr.skippedChar(chCloseAngle))
01144                     break;
01145             }
01146 
01147             // Check for correct surrogate pairs
01148             if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
01149             {
01150                 if (gotLeadingSurrogate)
01151                     emitError(XMLErrs::Expected2ndSurrogateChar);
01152                 else
01153                     gotLeadingSurrogate = true;
01154             }
01155              else
01156             {
01157                 if (gotLeadingSurrogate)
01158                 {
01159                     if ((nextCh < 0xDC00) || (nextCh > 0xDFFF))
01160                         emitError(XMLErrs::Expected2ndSurrogateChar);
01161                 }
01162                 // Its got to at least be a valid XML character
01163                 else if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh)) {
01164 
01165                     XMLCh tmpBuf[9];
01166                     XMLString::binToText
01167                     (
01168                         nextCh
01169                         , tmpBuf
01170                         , 8
01171                         , 16
01172                         , fMemoryManager
01173                     );
01174                     emitError(XMLErrs::InvalidCharacter, tmpBuf);
01175                 }
01176 
01177                 gotLeadingSurrogate = false;
01178             }
01179 
01180             bbTarget.append(nextCh);
01181         }
01182     }
01183     else
01184     {
01185         // No target, but make sure its terminated ok
01186         if (!fReaderMgr.skippedChar(chQuestion))
01187         {
01188             emitError(XMLErrs::UnterminatedPI);
01189             fReaderMgr.skipPastChar(chCloseAngle);
01190             return;
01191         }
01192 
01193         if (!fReaderMgr.skippedChar(chCloseAngle))
01194         {
01195             emitError(XMLErrs::UnterminatedPI);
01196             fReaderMgr.skipPastChar(chCloseAngle);
01197             return;
01198         }
01199     }
01200 
01201     // Point the target pointer at the raw data
01202     targetPtr = bbTarget.getRawBuffer();
01203 
01204     // If we have a handler, then call it
01205     if (fDocHandler)
01206     {
01207         fDocHandler->docPI
01208         (
01209             namePtr
01210             , targetPtr
01211        );
01212     }
01213 
01214     //mark PI is seen within the current element
01215     if (! fElemStack.isEmpty())
01216         fElemStack.setCommentOrPISeen();
01217 
01218 }
01219 
01220 //  Scans all the input from the start of the file to the root element.
01221 //  There does not have to be anything in the prolog necessarily, but usually
01222 //  there is at least an XMLDecl.
01223 //
01224 //  On exit from here we are either at the end of the file or about to read
01225 //  the opening < of the root element.
01226 void XMLScanner::scanProlog()
01227 {
01228     bool sawDocTypeDecl = false;
01229     // Get a buffer for whitespace processing
01230     XMLBufBid bbCData(&fBufMgr);
01231 
01232     //  Loop through the prolog. If there is no content, this could go all
01233     //  the way to the end of the file.
01234     try
01235     {
01236         while (true)
01237         {
01238             const XMLCh nextCh = fReaderMgr.peekNextChar();
01239 
01240             if (nextCh == chOpenAngle)
01241             {
01242                 //  Ok, it could be the xml decl, a comment, the doc type line,
01243                 //  or the start of the root element.
01244                 if (checkXMLDecl(true))
01245                 {
01246                     // There shall be at lease --ONE-- space in between
01247                     // the tag '<?xml' and the VersionInfo.
01248                     //
01249                     //  If we are not at line 1, col 6, then the decl was not
01250                     //  the first text, so its invalid.
01251                     const XMLReader* curReader = fReaderMgr.getCurrentReader();
01252                     if ((curReader->getLineNumber() != 1)
01253                     ||  (curReader->getColumnNumber() != 7))
01254                     {
01255                         emitError(XMLErrs::XMLDeclMustBeFirst);
01256                     }
01257 
01258                     scanXMLDecl(Decl_XML);
01259                 }
01260                 else if (fReaderMgr.skippedString(XMLUni::fgPIString))
01261                 {
01262                     scanPI();
01263                 }
01264                  else if (fReaderMgr.skippedString(XMLUni::fgCommentString))
01265                 {
01266                     scanComment();
01267                 }
01268                  else if (fReaderMgr.skippedString(XMLUni::fgDocTypeString))
01269                 {
01270                     if (sawDocTypeDecl) {
01271                         emitError(XMLErrs::DuplicateDocTypeDecl);
01272                     }
01273                     scanDocTypeDecl();
01274                     sawDocTypeDecl = true;
01275 
01276                     // if reusing grammar, this has been validated already in first scan
01277                     // skip for performance
01278                     if (fValidate && fGrammar && !fGrammar->getValidated()) {
01279                         //  validate the DTD scan so far
01280                         fValidator->preContentValidation(fUseCachedGrammar, true);
01281                     }
01282                 }
01283                 else
01284                 {
01285                     // Assume its the start of the root element
01286                     return;
01287                 }
01288             }
01289             else if (fReaderMgr.getCurrentReader()->isWhitespace(nextCh))
01290             {
01291                 //  If we have a document handler then gather up the
01292                 //  whitespace and call back. Otherwise just skip over spaces.
01293                 if (fDocHandler)
01294                 {
01295                     fReaderMgr.getSpaces(bbCData.getBuffer());
01296                     fDocHandler->ignorableWhitespace
01297                     (
01298                         bbCData.getRawBuffer()
01299                         , bbCData.getLen()
01300                         , false
01301                     );
01302                 }
01303                  else
01304                 {
01305                     fReaderMgr.skipPastSpaces();
01306                 }
01307             }
01308              else
01309             {
01310                 emitError(XMLErrs::InvalidDocumentStructure);
01311 
01312                 // Watch for end of file and break out
01313                 if (!nextCh)
01314                     break;
01315                 else
01316                     fReaderMgr.skipPastChar(chCloseAngle);
01317             }
01318 
01319         }
01320     }
01321     catch(const EndOfEntityException&)
01322     {
01323         //  We should never get an end of entity here. They should only
01324         //  occur within the doc type scanning method, and not leak out to
01325         //  here.
01326         emitError
01327         (
01328             XMLErrs::UnexpectedEOE
01329             , "in prolog"
01330         );
01331     }
01332 }
01333 
01334 
01335 //  Scans the <?xml .... ?> line. This stuff is all sequential so we don't
01336 //  do any state machine loop here. We just bull straight through it. It ends
01337 //  past the closing bracket. If there is a document handler, then its called
01338 //  on the XMLDecl callback.
01339 //
01340 //  On entry, the <?xml has been scanned, and we pick it up from there.
01341 //
01342 //  NOTE: In order to provide good recovery from bad XML here, we try to be
01343 //  very flexible. No matter what order the stuff is in, we'll keep going
01344 //  though we'll issue errors.
01345 //
01346 //  The parameter tells us which type of decl we should expect, Text or XML.
01347 //    [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
01348 //    [77] TextDecl::= '<?xml' VersionInfo? EncodingDecl S? '?>'
01349 void XMLScanner::scanXMLDecl(const DeclTypes type)
01350 {
01351     // Get us some buffers to use
01352     XMLBufBid bbVersion(&fBufMgr);
01353     XMLBufBid bbEncoding(&fBufMgr);
01354     XMLBufBid bbStand(&fBufMgr);
01355     XMLBufBid bbDummy(&fBufMgr);
01356     XMLBufBid bbName(&fBufMgr);
01357 
01358     //  We use this little enum and array to keep up with what we found
01359     //  and what order we found them in. This lets us get them free form
01360     //  without too much overhead, but still know that they were in the
01361     //  wrong order.
01362     enum Strings
01363     {
01364         VersionString
01365         , EncodingString
01366         , StandaloneString
01367         , UnknownString
01368 
01369         , StringCount
01370     };
01371     int flags[StringCount] = { -1, -1, -1, -1 };
01372 
01373     //  Also set up a list of buffers in the right order so that we know
01374     //  where to put stuff.
01375     XMLBuffer* buffers[StringCount] ;
01376     buffers[0] = &bbVersion.getBuffer();
01377     buffers[1] = &bbEncoding.getBuffer();
01378     buffers[2] = &bbStand.getBuffer();
01379     buffers[3] = &bbDummy.getBuffer();
01380 
01381     int curCount = 0;
01382     Strings curString;
01383     XMLBuffer& nameBuf = bbName.getBuffer();
01384     while (true)
01385     {
01386         // Skip any spaces
01387         bool skippedSomething;
01388         fReaderMgr.skipPastSpaces(skippedSomething, true);
01389 
01390         // If we are looking at a question mark, then break out
01391         if (fReaderMgr.lookingAtChar(chQuestion))
01392             break;
01393 
01394         // If this is not the first string, then we require the spaces
01395         if (!skippedSomething && curCount)
01396             emitError(XMLErrs::ExpectedWhitespace);
01397 
01398         //  Get characters up to the next whitespace or equal's sign.
01399         if (!scanUpToWSOr(nameBuf, chEqual))
01400             emitError(XMLErrs::ExpectedDeclString);
01401 
01402         // See if it matches any of our expected strings
01403         if (XMLString::equals(nameBuf.getRawBuffer(), XMLUni::fgVersionString))
01404             curString = VersionString;
01405         else if (XMLString::equals(nameBuf.getRawBuffer(), XMLUni::fgEncodingString))
01406             curString = EncodingString;
01407         else if (XMLString::equals(nameBuf.getRawBuffer(), XMLUni::fgStandaloneString))
01408             curString = StandaloneString;
01409         else
01410             curString = UnknownString;
01411 
01412         //  If its an unknown string, then give that error. Else check to
01413         //  see if this one has been done already and give that error.
01414         if (curString == UnknownString)
01415             emitError(XMLErrs::ExpectedDeclString, nameBuf.getRawBuffer());
01416         else if (flags[curString] != -1)
01417             emitError(XMLErrs::DeclStringRep, nameBuf.getRawBuffer());
01418         else if (flags[curString] == -1)
01419             flags[curString] = ++curCount;
01420 
01421         //  Scan for an equal's sign. If we don't find it, issue an error
01422         //  but keep trying to go on.
01423         if (!scanEq(true))
01424             emitError(XMLErrs::ExpectedEqSign);
01425 
01426         //  Get a quote string into the buffer for the string that we are
01427         //  currently working on.
01428         if (!getQuotedString(*buffers[curString]))
01429         {
01430             emitError(XMLErrs::ExpectedQuotedString);
01431             fReaderMgr.skipPastChar(chCloseAngle);
01432             return;
01433         }
01434 
01435         // And validate the value according which one it was
01436         const XMLCh* rawValue = buffers[curString]->getRawBuffer();
01437         if (curString == VersionString)
01438         {
01439             if (XMLString::equals(rawValue, XMLUni::fgVersion1_1)) {
01440                 if (type == Decl_XML) {
01441                         fXMLVersion = XMLReader::XMLV1_1;
01442                     fReaderMgr.setXMLVersion(XMLReader::XMLV1_1);
01443                 }
01444                 else {
01445                     if (fXMLVersion != XMLReader::XMLV1_1)
01446                         emitError(XMLErrs::UnsupportedXMLVersion, rawValue);
01447                 }
01448             }
01449             else if (XMLString::equals(rawValue, XMLUni::fgVersion1_0)) {
01450                 if (type == Decl_XML) {
01451                         fXMLVersion = XMLReader::XMLV1_0;
01452                     fReaderMgr.setXMLVersion(XMLReader::XMLV1_0);
01453                 }
01454             }
01455             else
01456                 emitError(XMLErrs::UnsupportedXMLVersion, rawValue);
01457         }
01458          else if (curString == EncodingString)
01459         {
01460             if (!XMLString::isValidEncName(rawValue))
01461                 emitError(XMLErrs::BadXMLEncoding, rawValue);
01462         }
01463          else if (curString == StandaloneString)
01464         {
01465             if (XMLString::equals(rawValue, XMLUni::fgYesString))
01466                 fStandalone = true;
01467             else if (XMLString::equals(rawValue, XMLUni::fgNoString))
01468                 fStandalone = false;
01469             else
01470             {
01471                 emitError(XMLErrs::BadStandalone);
01472                 //if (!XMLString::compareIString(rawValue, XMLUni::fgYesString))
01473                 //else if (!XMLString::compareIString(rawValue, XMLUni::fgNoString))
01474                 if (buffers[curString]->getLen() == 3 &&
01475                     (((rawValue[0] == chLatin_y) || (rawValue[0] == chLatin_Y)) &&
01476                      ((rawValue[1] == chLatin_e) || (rawValue[1] == chLatin_E)) &&
01477                      ((rawValue[2] == chLatin_s) || (rawValue[2] == chLatin_S))))
01478                     fStandalone = true;
01479                 else if (buffers[curString]->getLen() == 2 &&
01480                     (((rawValue[0] == chLatin_n) || (rawValue[0] == chLatin_N)) &&
01481                      ((rawValue[1] == chLatin_o) || (rawValue[1] == chLatin_O))))
01482                     fStandalone = false;
01483             }
01484         }
01485     }
01486 
01487     //  Make sure that the strings present are in order. We don't care about
01488     //  which ones are present at this point, just that any there are in the
01489     //  right order.
01490     int curTop = 0;
01491     for (int index = VersionString; index < StandaloneString; index++)
01492     {
01493         if (flags[index] != -1)
01494         {
01495             if (flags[index] !=  curTop + 1)
01496             {
01497                 emitError(XMLErrs::DeclStringsInWrongOrder);
01498                 break;
01499             }
01500             curTop = flags[index];
01501         }
01502     }
01503 
01504     //  If its an XML decl, the version must be present.
01505     //  If its a Text decl, then encoding must be present AND standalone must not be present.
01506     if ((type == Decl_XML) && (flags[VersionString] == -1))
01507         emitError(XMLErrs::XMLVersionRequired);
01508     else if (type == Decl_Text) {
01509         if (flags[StandaloneString] != -1)
01510             emitError(XMLErrs::StandaloneNotLegal);
01511         if (flags[EncodingString] == -1)
01512             emitError(XMLErrs::EncodingRequired);
01513     }
01514 
01515     if (!fReaderMgr.skippedChar(chQuestion))
01516     {
01517         emitError(XMLErrs::UnterminatedXMLDecl);
01518         fReaderMgr.skipPastChar(chCloseAngle);
01519     }
01520      else if (!fReaderMgr.skippedChar(chCloseAngle))
01521     {
01522         emitError(XMLErrs::UnterminatedXMLDecl);
01523         fReaderMgr.skipPastChar(chCloseAngle);
01524     }
01525 
01526     //  Do this before we possibly update the reader with the
01527     //  actual encoding string. Otherwise, we will pass the wrong thing
01528     //  for the last parameter!
01529     const XMLCh* actualEnc = fReaderMgr.getCurrentEncodingStr();
01530 
01531     //  Ok, we've now seen the real encoding string, if there was one, so
01532     //  lets call back on the current reader and tell it what the real
01533     //  encoding string was. If it fails, that's because it represents some
01534     //  sort of contradiction with the autosensed format, and it keeps the
01535     //  original encoding.
01536     //
01537     //  NOTE: This can fail for a number of reasons, such as a bogus encoding
01538     //  name or because its in flagrant contradiction of the auto-sensed
01539     //  format.
01540     if (flags[EncodingString] != -1)
01541     {
01542         if (!fReaderMgr.getCurrentReader()->setEncoding(bbEncoding.getRawBuffer()))
01543             emitError(XMLErrs::ContradictoryEncoding, bbEncoding.getRawBuffer());
01544         else
01545             actualEnc = bbEncoding.getRawBuffer();
01546     }
01547 
01548     //  If we have a document handler then call the XML Decl callback.
01549     if (type == Decl_XML)
01550     {
01551         if (fDocHandler)
01552             fDocHandler->XMLDecl
01553             (
01554                 bbVersion.getRawBuffer()
01555                 , bbEncoding.getRawBuffer()
01556                 , bbStand.getRawBuffer()
01557                 , actualEnc
01558             );
01559     }
01560     else if (type == Decl_Text)
01561     {
01562         if (fDocTypeHandler)
01563             fDocTypeHandler->TextDecl
01564             (
01565                 bbVersion.getRawBuffer()
01566                 , bbEncoding.getRawBuffer()
01567             );
01568     }
01569 }
01570 
01571 const XMLCh* XMLScanner::getURIText(const   unsigned int    uriId) const
01572 {
01573     if (fURIStringPool->exists(uriId)) {
01574         // Look up the URI in the string pool and return its id
01575         const XMLCh* value = fURIStringPool->getValueForId(uriId);
01576         if (!value)
01577             return XMLUni::fgZeroLenString;
01578 
01579         return value;
01580     }
01581     else
01582         return XMLUni::fgZeroLenString;
01583 }
01584 
01585 bool XMLScanner::getURIText(  const   unsigned int    uriId
01586                       ,       XMLBuffer&      uriBufToFill) const
01587 {
01588     if (fURIStringPool->exists(uriId)) {
01589         // Look up the URI in the string pool and return its id
01590         const XMLCh* value = fURIStringPool->getValueForId(uriId);
01591         if (!value)
01592             return false;
01593 
01594         uriBufToFill.set(value);
01595         return true;
01596     }
01597     else
01598         return false;
01599 }
01600 
01601 bool XMLScanner::checkXMLDecl(bool startWithAngle) {
01602 
01603     // [23] XMLDecl     ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
01604     // [24] VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
01605     //
01606     // [3]  S           ::= (#x20 | #x9 | #xD | #xA)+
01607     if (startWithAngle) {
01608         if (fReaderMgr.peekString(XMLUni::fgXMLDeclString)) {
01609             if (fReaderMgr.skippedString(XMLUni::fgXMLDeclStringSpace)
01610                || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringHTab)
01611                || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringLF)
01612                || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringCR))
01613             {
01614                 return true;
01615             }
01616         }
01617         else if (fReaderMgr.skippedString(XMLUni::fgXMLDeclStringSpaceU)
01618            || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringHTabU)
01619            || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringLFU)
01620            || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringCRU))
01621         {
01622             //  Just in case, check for upper case. If found, issue
01623             //  an error, but keep going.
01624             emitError(XMLErrs::XMLDeclMustBeLowerCase);
01625             return true;
01626         }
01627     }
01628     else {
01629         if (fReaderMgr.peekString(XMLUni::fgXMLString)) {
01630             if (fReaderMgr.skippedString(XMLUni::fgXMLStringSpace)
01631                || fReaderMgr.skippedString(XMLUni::fgXMLStringHTab)
01632                || fReaderMgr.skippedString(XMLUni::fgXMLStringLF)
01633                || fReaderMgr.skippedString(XMLUni::fgXMLStringCR))
01634             {
01635                 return true;
01636             }
01637         }
01638         else if (fReaderMgr.skippedString(XMLUni::fgXMLStringSpaceU)
01639            || fReaderMgr.skippedString(XMLUni::fgXMLStringHTabU)
01640            || fReaderMgr.skippedString(XMLUni::fgXMLStringLFU)
01641            || fReaderMgr.skippedString(XMLUni::fgXMLStringCRU))
01642         {
01643             //  Just in case, check for upper case. If found, issue
01644             //  an error, but keep going.
01645             emitError(XMLErrs::XMLDeclMustBeLowerCase);
01646             return true;
01647         }
01648     }
01649 
01650     return false;
01651 }
01652 
01653 
01654 // ---------------------------------------------------------------------------
01655 //  XMLScanner: Grammar preparsing
01656 // ---------------------------------------------------------------------------
01657 Grammar* XMLScanner::loadGrammar(const   XMLCh* const systemId
01658                                  , const short        grammarType
01659                                  , const bool         toCache)
01660 {
01661     InputSource* srcToUse = 0;
01662 
01663     if (fEntityHandler){
01664         ReaderMgr::LastExtEntityInfo lastInfo;
01665         fReaderMgr.getLastExtEntityInfo(lastInfo);
01666         XMLResourceIdentifier resourceIdentifier(XMLResourceIdentifier::ExternalEntity,
01667                             systemId, 0, XMLUni::fgZeroLenString, lastInfo.systemId,
01668                             &fReaderMgr);
01669         srcToUse = fEntityHandler->resolveEntity(&resourceIdentifier);
01670     }
01671 
01672     //  First we try to parse it as a URL. If that fails, we assume its
01673     //  a file and try it that way.
01674     if (!srcToUse) {
01675         if (fDisableDefaultEntityResolution)
01676             return 0;
01677 
01678         try
01679         {
01680             //  Create a temporary URL. Since this is the primary document,
01681             //  it has to be fully qualified. If not, then assume we are just
01682             //  mistaking a file for a URL.
01683             XMLURL tmpURL(fMemoryManager);
01684 
01685             if (XMLURL::parse(systemId, tmpURL)) {
01686 
01687                 if (tmpURL.isRelative())
01688                 {
01689                     if (!fStandardUriConformant)
01690                         srcToUse = new (fMemoryManager) LocalFileInputSource(systemId, fMemoryManager);
01691                     else {
01692                         // since this is the top of the try/catch, cannot call ThrowXMLwithMemMgr
01693                         // emit the error directly
01694                         MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_NoProtocolPresent, fMemoryManager);
01695                         fInException = true;
01696                         emitError
01697                         (
01698                             XMLErrs::XMLException_Fatal
01699                             , e.getCode()
01700                             , e.getMessage()
01701                         );
01702                         return 0;
01703                     }
01704                 }
01705                 else
01706                 {
01707                     if (fStandardUriConformant && tmpURL.hasInvalidChar()) {
01708                         MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_MalformedURL, fMemoryManager);
01709                         fInException = true;
01710                         emitError
01711                         (
01712                             XMLErrs::XMLException_Fatal
01713                             , e.getCode()
01714                             , e.getMessage()
01715                         );
01716                         return 0;
01717                     }
01718                     srcToUse = new (fMemoryManager) URLInputSource(tmpURL, fMemoryManager);
01719                 }
01720             }
01721             else
01722             {
01723                 if (!fStandardUriConformant)
01724                     srcToUse = new (fMemoryManager) LocalFileInputSource(systemId, fMemoryManager);
01725                 else {
01726                     // since this is the top of the try/catch, cannot call ThrowXMLwithMemMgr
01727                     // emit the error directly
01728                     // lazy bypass ... since all MalformedURLException are fatal, no need to check the type
01729                     MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_MalformedURL);
01730                     fInException = true;
01731                     emitError
01732                     (
01733                         XMLErrs::XMLException_Fatal
01734                         , e.getCode()
01735                         , e.getMessage()
01736                     );
01737                     return 0;
01738                 }
01739             }
01740         }
01741         catch(const XMLException& excToCatch)
01742         {
01743             //  For any other XMLException,
01744             //  emit the error and catch any user exception thrown from here.
01745             fInException = true;
01746             if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning)
01747                 emitError
01748                 (
01749                     XMLErrs::XMLException_Warning
01750                     , excToCatch.getCode()
01751                     , excToCatch.getMessage()
01752                 );
01753             else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal)
01754                 emitError
01755                 (
01756                     XMLErrs::XMLException_Fatal
01757                     , excToCatch.getCode()
01758                     , excToCatch.getMessage()
01759                 );
01760             else
01761                 emitError
01762                 (
01763                     XMLErrs::XMLException_Error
01764                     , excToCatch.getCode()
01765                     , excToCatch.getMessage()
01766                 );
01767                 return 0;
01768         }
01769     }
01770 
01771     Janitor<InputSource> janSrc(srcToUse);
01772     return loadGrammar(*srcToUse, grammarType, toCache);
01773 }
01774 
01775 Grammar* XMLScanner::loadGrammar(const   char* const systemId
01776                                  , const short       grammarType
01777                                  , const bool        toCache)
01778 {
01779     // We just delegate this to the XMLCh version after transcoding
01780     XMLCh* tmpBuf = XMLString::transcode(systemId, fMemoryManager);
01781     ArrayJanitor<XMLCh> janBuf(tmpBuf, fMemoryManager);
01782     return loadGrammar(tmpBuf, grammarType, toCache);
01783 }
01784 
01785 
01786 // ---------------------------------------------------------------------------
01787 //  XMLScanner: Setter methods
01788 // ---------------------------------------------------------------------------
01789 void XMLScanner::setURIStringPool(XMLStringPool* const stringPool)
01790 {
01791     fURIStringPool = stringPool;
01792     fEmptyNamespaceId   = fURIStringPool->addOrFind(XMLUni::fgZeroLenString);
01793     fUnknownNamespaceId = fURIStringPool->addOrFind(XMLUni::fgUnknownURIName);
01794     fXMLNamespaceId     = fURIStringPool->addOrFind(XMLUni::fgXMLURIName);
01795     fXMLNSNamespaceId   = fURIStringPool->addOrFind(XMLUni::fgXMLNSURIName);
01796 }
01797 
01798 // ---------------------------------------------------------------------------
01799 //  XMLScanner: Private helper methods
01800 // ---------------------------------------------------------------------------
01801 
01802 /***
01803  * In reusing grammars (cacheing grammar from parse, or use cached grammar), internal
01804  * dtd is allowed conditionally.
01805  *
01806  * In the case of cacheing grammar from parse, it is NOT allowed.
01807  *
01808  * In the case of use cached grammar,
01809  *   if external dtd is present and it is parsed before, then it is not allowed,
01810  *   otherwise it is allowed.
01811  *
01812  ***/
01813 void XMLScanner::checkInternalDTD(bool hasExtSubset
01814                                  ,const XMLCh* const sysId
01815                                  ,const XMLCh* const pubId)
01816 {
01817     if (fToCacheGrammar)
01818         ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Val_CantHaveIntSS, fMemoryManager);
01819 
01820     if (fUseCachedGrammar && hasExtSubset && !fIgnoreCachedDTD)
01821     {
01822         InputSource* sysIdSrc = resolveSystemId(sysId, pubId);
01823         if (sysIdSrc) {
01824             Janitor<InputSource> janSysIdSrc(sysIdSrc);
01825             Grammar* grammar = fGrammarResolver->getGrammar(sysIdSrc->getSystemId());
01826 
01827             if (grammar && grammar->getGrammarType() == Grammar::DTDGrammarType)
01828             {
01829                 ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Val_CantHaveIntSS, fMemoryManager);
01830             }
01831         }
01832     }
01833 
01834 }
01835 
01836 //  This method is called after the content scan to insure that all the
01837 //  ID/IDREF attributes match up (i.e. that all IDREFs refer to IDs.) This is
01838 //  an XML 1.0 rule, so we can do here in the core.
01839 
01840 void XMLScanner::checkIDRefs()
01841 {
01842     //  Iterate the id ref list. If we find any entries here which are used
01843     //  but not declared, then that's an error.
01844     RefHashTableOfEnumerator<XMLRefInfo> refEnum(fValidationContext->getIdRefList(), false, fMemoryManager);
01845     while (refEnum.hasMoreElements())
01846     {
01847         // Get a ref to the current element
01848         const XMLRefInfo& curRef = refEnum.nextElement();
01849 
01850         // If its used but not declared, then its an error
01851         if (!curRef.getDeclared() && curRef.getUsed() && fValidate)
01852             fValidator->emitError(XMLValid::IDNotDeclared, curRef.getRefName());
01853     }
01854 }
01855 
01856 
01857 //  This just does a simple check that the passed progressive scan token is
01858 //  legal for this scanner.
01859 bool XMLScanner::isLegalToken(const XMLPScanToken& toCheck)
01860 {
01861     return ((fScannerId == toCheck.fScannerId)
01862     &&      (fSequenceId == toCheck.fSequenceId));
01863 }
01864 
01865 
01866 //  This method will handle figuring out what the next top level token is
01867 //  in the input stream. It will return an enumerated value that indicates
01868 //  what it believes the next XML level token must be. It will eat as many
01869 //  chars are required to figure out what is next.
01870 XMLScanner::XMLTokens XMLScanner::senseNextToken(XMLSize_t& orgReader)
01871 {
01872     //  Get the next character and use it to guesstimate what the next token
01873     //  is going to be. We turn on end of entity exceptions when we do this
01874     //  in order to catch the scenario where the current entity ended at
01875     //  the > of some markup.
01876     XMLCh nextCh=0;
01877 
01878     XMLReader* curReader=fReaderMgr.getCurrentReader();
01879     // avoid setting up the ThrowEOEJanitor if we know that we have data in the current reader
01880     if(curReader && curReader->charsLeftInBuffer()>0)
01881         nextCh = fReaderMgr.peekNextChar();
01882     else
01883     {
01884         ThrowEOEJanitor janMgr(&fReaderMgr, true);
01885         nextCh = fReaderMgr.peekNextChar();
01886     }
01887 
01888     //  If it's not a '<' we must be in content (unless it's a EOF)
01889     //
01890     //  This includes entity references '&' of some sort. These must
01891     //  be character data because that's the only place a reference can
01892     //  occur in content.
01893     if (nextCh != chOpenAngle)
01894         return nextCh?Token_CharData:Token_EOF;
01895 
01896     //  Ok it had to have been a '<' character. So get it out of the reader
01897     //  and store the reader number where we saw it, passing it back to the
01898     //  caller.
01899     fReaderMgr.getNextChar();
01900     orgReader = fReaderMgr.getCurrentReaderNum();
01901 
01902     //  Ok, so lets go through the things that it could be at this point which
01903     //  are all some form of markup.
01904     switch(fReaderMgr.peekNextChar())
01905     {
01906     case chForwardSlash:
01907         {
01908             fReaderMgr.getNextChar();
01909             return Token_EndTag;
01910         }
01911     case chBang:
01912         {
01913             static const XMLCh gCDATAStr[] =
01914             {
01915                     chBang, chOpenSquare, chLatin_C, chLatin_D, chLatin_A
01916                 ,   chLatin_T, chLatin_A, chNull
01917             };
01918 
01919             static const XMLCh gCommentString[] =
01920             {
01921                 chBang, chDash, chDash, chNull
01922             };
01923 
01924             if (fReaderMgr.skippedString(gCDATAStr))
01925                 return Token_CData;
01926 
01927             if (fReaderMgr.skippedString(gCommentString))
01928                 return Token_Comment;
01929 
01930             emitError(XMLErrs::ExpectedCommentOrCDATA);
01931             return Token_Unknown;
01932         }
01933     case chQuestion:
01934         {
01935             // It must be a PI
01936             fReaderMgr.getNextChar();
01937             return Token_PI;
01938         }
01939     }
01940     //  Assume its an element name, so return with a start tag token. If it
01941     //  turns out not to be, then it will fail when it cannot get a valid tag.
01942     return Token_StartTag;
01943 }
01944 
01945 // ---------------------------------------------------------------------------
01946 //  XMLScanner: Private parsing methods
01947 // ---------------------------------------------------------------------------
01948 
01949 //  This guy just scans out a single or double quoted string of characters.
01950 //  It does not pass any judgement on the contents and assumes that it is
01951 //  illegal to have another quote of the same kind inside the string's
01952 //  contents.
01953 //
01954 //  NOTE: This is for simple stuff like the strings in the XMLDecl which
01955 //  cannot have any entities inside them. So this guy does not handle any
01956 //  end of entity stuff.
01957 bool XMLScanner::getQuotedString(XMLBuffer& toFill)
01958 {
01959     // Reset the target buffer
01960     toFill.reset();
01961 
01962     // Get the next char which must be a single or double quote
01963     XMLCh quoteCh;
01964     if (!fReaderMgr.skipIfQuote(quoteCh))
01965         return false;
01966 
01967         XMLCh nextCh;
01968     // Get another char and see if it matches the starting quote char
01969     while ((nextCh=fReaderMgr.getNextChar())!=quoteCh)
01970     {
01971         //  We should never get either an end of file null char here. If we
01972         //  do, just fail. It will be handled more gracefully in the higher
01973         //  level code that called us.
01974         if (!nextCh)
01975             return false;
01976 
01977         // Else add it to the buffer
01978         toFill.append(nextCh);
01979     }
01980     return true;
01981 }
01982 
01983 
01984 //  This method scans a character reference and returns the character that
01985 //  was refered to. It assumes that we've already scanned the &# characters
01986 //  that prefix the numeric code.
01987 bool XMLScanner::scanCharRef(XMLCh& toFill, XMLCh& second)
01988 {
01989     bool gotOne = false;
01990     unsigned int value = 0;
01991 
01992     //  Set the radix. Its supposed to be a lower case x if hex. But, in
01993     //  order to recover well, we check for an upper and put out an error
01994     //  for that.
01995     unsigned int radix = 10;
01996     if (fReaderMgr.skippedChar(chLatin_x))
01997     {
01998         radix = 16;
01999     }
02000     else if (fReaderMgr.skippedChar(chLatin_X))
02001     {
02002         emitError(XMLErrs::HexRadixMustBeLowerCase);
02003         radix = 16;
02004     }
02005 
02006     while (true)
02007     {
02008         const XMLCh nextCh = fReaderMgr.peekNextChar();
02009 
02010         // Watch for EOF
02011         if (!nextCh)
02012             ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
02013 
02014         // Break out on the terminating semicolon
02015         if (nextCh == chSemiColon)
02016         {
02017             fReaderMgr.getNextChar();
02018             break;
02019         }
02020 
02021         //  Convert this char to a binary value, or bail out if its not
02022         //  one.
02023         unsigned int nextVal;
02024         if ((nextCh >= chDigit_0) && (nextCh <= chDigit_9))
02025             nextVal = (unsigned int)(nextCh - chDigit_0);
02026         else if ((nextCh >= chLatin_A) && (nextCh <= chLatin_F))
02027             nextVal= (unsigned int)(10 + (nextCh - chLatin_A));
02028         else if ((nextCh >= chLatin_a) && (nextCh <= chLatin_f))
02029             nextVal = (unsigned int)(10 + (nextCh - chLatin_a));
02030         else
02031         {
02032             // Return a zero
02033             toFill = 0;
02034 
02035             //  If we got at least a sigit, then do an unterminated ref error.
02036             //  Else, do an expected a numerical ref thing.
02037             if (gotOne)
02038                 emitError(XMLErrs::UnterminatedCharRef);
02039             else
02040                 emitError(XMLErrs::ExpectedNumericalCharRef);
02041 
02042             // Return failure
02043             return false;
02044         }
02045 
02046         //  Make sure its valid for the radix. If not, then just eat the
02047         //  digit and go on after issueing an error. Else, update the
02048         //  running value with this new digit.
02049         if (nextVal >= radix)
02050         {
02051             XMLCh tmpStr[2];
02052             tmpStr[0] = nextCh;
02053             tmpStr[1] = chNull;
02054             emitError(XMLErrs::BadDigitForRadix, tmpStr);
02055         }
02056         else
02057         {
02058             value = (value * radix) + nextVal;
02059             // Guard against overflow.
02060             if (value > 0x10FFFF) {
02061                 // Character reference was not in the valid range
02062                 emitError(XMLErrs::InvalidCharacterRef);
02063                 return false;
02064             }
02065         }
02066 
02067         // Indicate that we got at least one good digit
02068         gotOne = true;
02069 
02070         // And eat the last char
02071         fReaderMgr.getNextChar();
02072     }
02073 
02074     // Return the char (or chars)
02075     // And check if the character expanded is valid or not
02076     if (value >= 0x10000 && value <= 0x10FFFF)
02077     {
02078         value -= 0x10000;
02079         toFill = XMLCh((value >> 10) + 0xD800);
02080         second = XMLCh((value & 0x3FF) + 0xDC00);
02081     }
02082     else if (value <= 0xFFFD)
02083     {
02084         toFill = XMLCh(value);
02085         second = 0;
02086         if (!fReaderMgr.getCurrentReader()->isXMLChar(toFill) && !fReaderMgr.getCurrentReader()->isControlChar(toFill)) {
02087             // Character reference was not in the valid range
02088             emitError(XMLErrs::InvalidCharacterRef);
02089             return false;
02090         }
02091     }
02092     else {
02093         // Character reference was not in the valid range
02094         emitError(XMLErrs::InvalidCharacterRef);
02095         return false;
02096     }
02097 
02098     return true;
02099 }
02100 
02101 
02102 //  We get here after the '<!--' part of the comment. We scan past the
02103 //  terminating '-->' It will calls the appropriate handler with the comment
02104 //  text, if one is provided. A comment can be in either the document or
02105 //  the DTD, so the fInDocument flag is used to know which handler to send
02106 //  it to.
02107 void XMLScanner::scanComment()
02108 {
02109 
02110     enum States
02111     {
02112         InText
02113         , OneDash
02114         , TwoDashes
02115     };
02116 
02117     // Get a buffer for this
02118     XMLBufBid bbComment(&fBufMgr);
02119 
02120     //  Get the comment text into a temp buffer. Be sure to use temp buffer
02121     //  two here, since its to be used for stuff that is potentially longer
02122     //  than just a name.
02123     States curState = InText;
02124     bool gotLeadingSurrogate = false;
02125     while (true)
02126     {
02127         // Get the next character
02128         const XMLCh nextCh = fReaderMgr.getNextChar();
02129 
02130         //  Watch for an end of file
02131         if (!nextCh)
02132         {
02133             emitError(XMLErrs::UnterminatedComment);
02134             ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
02135         }
02136 
02137         // Check for correct surrogate pairs
02138         if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
02139         {
02140             if (gotLeadingSurrogate)
02141                 emitError(XMLErrs::Expected2ndSurrogateChar);
02142             else
02143                 gotLeadingSurrogate = true;
02144         }
02145         else
02146         {
02147             if (gotLeadingSurrogate)
02148             {
02149                 if ((nextCh < 0xDC00) || (nextCh > 0xDFFF))
02150                     emitError(XMLErrs::Expected2ndSurrogateChar);
02151             }
02152             // Its got to at least be a valid XML character
02153             else if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh)) {
02154 
02155                 XMLCh tmpBuf[9];
02156                 XMLString::binToText
02157                 (
02158                     nextCh
02159                     , tmpBuf
02160                     , 8
02161                     , 16
02162                     , fMemoryManager
02163                 );
02164                 emitError(XMLErrs::InvalidCharacter, tmpBuf);
02165             }
02166 
02167             gotLeadingSurrogate = false;
02168         }
02169 
02170         if (curState == InText)
02171         {
02172             // If its a dash, go to OneDash state. Otherwise take as text
02173             if (nextCh == chDash)
02174                 curState = OneDash;
02175             else
02176                 bbComment.append(nextCh);
02177         }
02178         else if (curState == OneDash)
02179         {
02180             //  If its another dash, then we change to the two dashes states.
02181             //  Otherwise, we have to put in the deficit dash and the new
02182             //  character and go back to InText.
02183             if (nextCh == chDash)
02184             {
02185                 curState = TwoDashes;
02186             }
02187             else
02188             {
02189                 bbComment.append(chDash);
02190                 bbComment.append(nextCh);
02191                 curState = InText;
02192             }
02193         }
02194         else if (curState == TwoDashes)
02195         {
02196             // The next character must be the closing bracket
02197             if (nextCh != chCloseAngle)
02198             {
02199                 emitError(XMLErrs::IllegalSequenceInComment);
02200                 fReaderMgr.skipPastChar(chCloseAngle);
02201                 return;
02202             }
02203             break;
02204         }
02205     }
02206 
02207     // If we have an available handler, call back with the comment.
02208     if (fDocHandler)
02209     {
02210         fDocHandler->docComment
02211         (
02212             bbComment.getRawBuffer()
02213         );
02214     }
02215 
02216     //mark comment is seen within the current element
02217     if (! fElemStack.isEmpty())
02218         fElemStack.setCommentOrPISeen();
02219 
02220 }
02221 
02222 
02223 //  Most equal signs can have white space around them, so this little guy
02224 //  just makes the calling code cleaner by eating whitespace.
02225 bool XMLScanner::scanEq(bool inDecl)
02226 {
02227     if(inDecl)
02228     {
02229         bool skippedSomething;
02230         fReaderMgr.skipPastSpaces(skippedSomething, inDecl);
02231         if (fReaderMgr.skippedChar(chEqual))
02232         {
02233             fReaderMgr.skipPastSpaces(skippedSomething, inDecl);
02234             return true;
02235         }
02236     }
02237     else
02238     {
02239         fReaderMgr.skipPastSpaces();
02240         if (fReaderMgr.skippedChar(chEqual))
02241         {
02242             fReaderMgr.skipPastSpaces();
02243             return true;
02244         }
02245     }
02246     return false;
02247 }
02248 
02249 
02250 XMLSize_t
02251 XMLScanner::scanUpToWSOr(XMLBuffer& toFill, const XMLCh chEndChar)
02252 {
02253     fReaderMgr.getUpToCharOrWS(toFill, chEndChar);
02254     return toFill.getLen();
02255 }
02256 
02257 unsigned int *XMLScanner::getNewUIntPtr()
02258 {
02259     // this method hands back a new pointer initialized to 0
02260     unsigned int *retVal;
02261     if(fUIntPoolCol < 64)
02262     {
02263         retVal = fUIntPool[fUIntPoolRow]+fUIntPoolCol;
02264         fUIntPoolCol++;
02265         return retVal;
02266     }
02267     // time to grow the pool...
02268     if(fUIntPoolRow+1 == fUIntPoolRowTotal)
02269     {
02270         // and time to add some space for new rows:
02271         fUIntPoolRowTotal <<= 1;
02272         unsigned int **newArray = (unsigned int **)fMemoryManager->allocate(sizeof(unsigned int *) * fUIntPoolRowTotal );
02273         memcpy(newArray, fUIntPool, (fUIntPoolRow+1) * sizeof(unsigned int *));
02274         fMemoryManager->deallocate(fUIntPool);
02275         fUIntPool = newArray;
02276         // need to 0 out new elements we won't need:
02277         for (unsigned int i=fUIntPoolRow+2; i<fUIntPoolRowTotal; i++)
02278             fUIntPool[i] = 0;
02279     }
02280     // now to add a new row; we just made sure we have space
02281     fUIntPoolRow++;
02282     fUIntPool[fUIntPoolRow] = (unsigned int *)fMemoryManager->allocate(sizeof(unsigned int) << 6);
02283     memset(fUIntPool[fUIntPoolRow], 0, sizeof(unsigned int) << 6);
02284     // point to next element
02285     fUIntPoolCol = 1;
02286     return fUIntPool[fUIntPoolRow];
02287 }
02288 
02289 void XMLScanner::resetUIntPool()
02290 {
02291     // to reuse the unsigned int pool--and the hashtables that use it--
02292     // simply reinitialize everything to 0's
02293     for(unsigned int i = 0; i<= fUIntPoolRow; i++)
02294         memset(fUIntPool[i], 0, sizeof(unsigned int) << 6);
02295 }
02296 
02297 void XMLScanner::recreateUIntPool()
02298 {
02299     // this allows a bloated unsigned int pool to be dispensed with
02300 
02301     // first, delete old fUIntPool
02302     for (unsigned int i=0; i<=fUIntPoolRow; i++)
02303     {
02304         fMemoryManager->deallocate(fUIntPool[i]);
02305     }
02306     fMemoryManager->deallocate(fUIntPool);
02307 
02308     fUIntPoolRow = fUIntPoolCol = 0;
02309     fUIntPoolRowTotal = 2;
02310     fUIntPool = (unsigned int **)fMemoryManager->allocate(sizeof(unsigned int *) * fUIntPoolRowTotal);
02311     fUIntPool[0] = (unsigned int *)fMemoryManager->allocate(sizeof(unsigned int) << 6);
02312     memset(fUIntPool[fUIntPoolRow], 0, sizeof(unsigned int) << 6);
02313     fUIntPool[1] = 0;
02314 }
02315 
02316 unsigned int XMLScanner::resolvePrefix(  const XMLCh* const        prefix
02317                                        , const ElemStack::MapModes mode)
02318 {
02319     //
02320     //  If the prefix is empty, and we are in attribute mode, then we assign
02321     //  it to the empty namespace because the default namespace does not
02322     //  apply to attributes.
02323     //
02324     if (!*prefix)
02325     {
02326         if(mode == ElemStack::Mode_Attribute)
02327             return fEmptyNamespaceId;
02328     }
02329     //  Watch for the special namespace prefixes. We always map these to
02330     //  special URIs. 'xml' gets mapped to the official URI that its defined
02331     //  to map to by the NS spec. xmlns gets mapped to a special place holder
02332     //  URI that we define (so that it maps to something checkable.)
02333     else
02334     {
02335         if (XMLString::equals(prefix, XMLUni::fgXMLNSString))
02336             return fXMLNSNamespaceId;
02337         else if (XMLString::equals(prefix, XMLUni::fgXMLString))
02338             return fXMLNamespaceId;
02339     }
02340 
02341     //  Ask the element stack to search up itself for a mapping for the
02342     //  passed prefix.
02343     bool unknown;
02344     unsigned int uriId = fElemStack.mapPrefixToURI(prefix, unknown);
02345 
02346     // If it was unknown, then the URI was faked in but we have to issue an error
02347     if (unknown)
02348         emitError(XMLErrs::UnknownPrefix, prefix);
02349 
02350     // check to see if uriId is empty; in XML 1.1 an emptynamespace is okay unless
02351     // we are trying to use it.
02352     if (*prefix &&
02353         mode == ElemStack::Mode_Element &&
02354         fXMLVersion != XMLReader::XMLV1_0 &&
02355         uriId == fElemStack.getEmptyNamespaceId())
02356         emitError(XMLErrs::UnknownPrefix, prefix);
02357 
02358     return uriId;
02359 }
02360 
02361 unsigned int
02362 XMLScanner::resolveQName(  const XMLCh* const           qName
02363                          ,       XMLBuffer&             prefixBuf
02364                          , const ElemStack::MapModes    mode
02365                          ,       int&                   prefixColonPos)
02366 {
02367     prefixColonPos = XMLString::indexOf(qName, chColon);
02368     return resolveQNameWithColon(qName, prefixBuf, mode, prefixColonPos);
02369 }
02370 
02371 unsigned int
02372 XMLScanner::resolveQNameWithColon(  const XMLCh* const          qName
02373                                   ,       XMLBuffer&            prefixBuf
02374                                   , const ElemStack::MapModes   mode
02375                                   , const int                   prefixColonPos)
02376 {
02377     //  Lets split out the qName into a URI and name buffer first. The URI
02378     //  can be empty.
02379     if (prefixColonPos == -1)
02380     {
02381         //  Its all name with no prefix, so put the whole thing into the name
02382         //  buffer. Then map the empty string to a URI, since the empty string
02383         //  represents the default namespace. This will either return some
02384         //  explicit URI which the default namespace is mapped to, or the
02385         //  the default global namespace.
02386         prefixBuf.reset();
02387         return resolvePrefix(XMLUni::fgZeroLenString, mode);
02388     }
02389     else
02390     {
02391         //  Copy the chars up to but not including the colon into the prefix
02392         //  buffer.
02393         prefixBuf.set(qName, prefixColonPos);
02394         return resolvePrefix(prefixBuf.getRawBuffer(), mode);
02395     }
02396 }
02397 
02398 XERCES_CPP_NAMESPACE_END