GME
13
|
00001 /* 00002 * Licensed to the Apache Software Foundation (ASF) under one or more 00003 * contributor license agreements. See the NOTICE file distributed with 00004 * this work for additional information regarding copyright ownership. 00005 * The ASF licenses this file to You under the Apache License, Version 2.0 00006 * (the "License"); you may not use this file except in compliance with 00007 * the License. You may obtain a copy of the License at 00008 * 00009 * http://www.apache.org/licenses/LICENSE-2.0 00010 * 00011 * Unless required by applicable law or agreed to in writing, software 00012 * distributed under the License is distributed on an "AS IS" BASIS, 00013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00014 * See the License for the specific language governing permissions and 00015 * limitations under the License. 00016 */ 00017 00018 /* 00019 * $Id: XMLScanner.cpp 882548 2009-11-20 13:44:14Z borisk $ 00020 */ 00021 00022 00023 // --------------------------------------------------------------------------- 00024 // Includes 00025 // --------------------------------------------------------------------------- 00026 #include <xercesc/internal/XMLScanner.hpp> 00027 #include <xercesc/internal/ValidationContextImpl.hpp> 00028 #include <xercesc/util/Janitor.hpp> 00029 #include <xercesc/util/Mutexes.hpp> 00030 #include <xercesc/util/RuntimeException.hpp> 00031 #include <xercesc/util/UnexpectedEOFException.hpp> 00032 #include <xercesc/util/XMLMsgLoader.hpp> 00033 #include <xercesc/util/XMLInitializer.hpp> 00034 #include <xercesc/framework/LocalFileInputSource.hpp> 00035 #include <xercesc/framework/URLInputSource.hpp> 00036 #include <xercesc/framework/XMLDocumentHandler.hpp> 00037 #include <xercesc/framework/XMLEntityHandler.hpp> 00038 #include <xercesc/framework/XMLPScanToken.hpp> 00039 #include <xercesc/framework/XMLValidator.hpp> 00040 #include <xercesc/internal/EndOfEntityException.hpp> 00041 #include <xercesc/validators/DTD/DocTypeHandler.hpp> 00042 #include <xercesc/validators/common/GrammarResolver.hpp> 00043 #include <xercesc/util/OutOfMemoryException.hpp> 00044 #include <xercesc/util/XMLResourceIdentifier.hpp> 00045 00046 XERCES_CPP_NAMESPACE_BEGIN 00047 00048 // --------------------------------------------------------------------------- 00049 // Local static data 00050 // --------------------------------------------------------------------------- 00051 static XMLUInt32 gScannerId = 0; 00052 static XMLMutex* sScannerMutex = 0; 00053 static XMLMsgLoader* gMsgLoader = 0; 00054 00055 void XMLInitializer::initializeXMLScanner() 00056 { 00057 gMsgLoader = XMLPlatformUtils::loadMsgSet(XMLUni::fgXMLErrDomain); 00058 00059 if (!gMsgLoader) 00060 XMLPlatformUtils::panic(PanicHandler::Panic_CantLoadMsgDomain); 00061 00062 sScannerMutex = new XMLMutex(XMLPlatformUtils::fgMemoryManager); 00063 } 00064 00065 void XMLInitializer::terminateXMLScanner() 00066 { 00067 delete gMsgLoader; 00068 gMsgLoader = 0; 00069 00070 delete sScannerMutex; 00071 sScannerMutex = 0; 00072 } 00073 00074 // 00075 // 00076 typedef JanitorMemFunCall<XMLScanner> CleanupType; 00077 typedef JanitorMemFunCall<ReaderMgr> ReaderMgrResetType; 00078 00079 00080 // --------------------------------------------------------------------------- 00081 // XMLScanner: Constructors and Destructor 00082 // --------------------------------------------------------------------------- 00083 XMLScanner::XMLScanner(XMLValidator* const valToAdopt, 00084 GrammarResolver* const grammarResolver, 00085 MemoryManager* const manager) 00086 : fBufferSize(1024 * 1024) 00087 , fLowWaterMark (100) 00088 , fStandardUriConformant(false) 00089 , fCalculateSrcOfs(false) 00090 , fDoNamespaces(false) 00091 , fExitOnFirstFatal(true) 00092 , fValidationConstraintFatal(false) 00093 , fInException(false) 00094 , fStandalone(false) 00095 , fHasNoDTD(true) 00096 , fValidate(false) 00097 , fValidatorFromUser(false) 00098 , fDoSchema(false) 00099 , fSchemaFullChecking(false) 00100 , fIdentityConstraintChecking(true) 00101 , fToCacheGrammar(false) 00102 , fUseCachedGrammar(false) 00103 , fLoadExternalDTD(true) 00104 , fLoadSchema(true) 00105 , fNormalizeData(true) 00106 , fGenerateSyntheticAnnotations(false) 00107 , fValidateAnnotations(false) 00108 , fIgnoreCachedDTD(false) 00109 , fIgnoreAnnotations(false) 00110 , fDisableDefaultEntityResolution(false) 00111 , fSkipDTDValidation(false) 00112 , fHandleMultipleImports(false) 00113 , fErrorCount(0) 00114 , fEntityExpansionLimit(0) 00115 , fEntityExpansionCount(0) 00116 , fEmptyNamespaceId(0) 00117 , fUnknownNamespaceId(0) 00118 , fXMLNamespaceId(0) 00119 , fXMLNSNamespaceId(0) 00120 , fSchemaNamespaceId(0) 00121 , fUIntPool(0) 00122 , fUIntPoolRow(0) 00123 , fUIntPoolCol(0) 00124 , fUIntPoolRowTotal(2) 00125 , fScannerId(0) 00126 , fSequenceId(0) 00127 , fAttrList(0) 00128 , fAttrDupChkRegistry(0) 00129 , fDocHandler(0) 00130 , fDocTypeHandler(0) 00131 , fEntityHandler(0) 00132 , fErrorReporter(0) 00133 , fErrorHandler(0) 00134 , fPSVIHandler(0) 00135 , fValidationContext(0) 00136 , fEntityDeclPoolRetrieved(false) 00137 , fReaderMgr(manager) 00138 , fValidator(valToAdopt) 00139 , fValScheme(Val_Never) 00140 , fGrammarResolver(grammarResolver) 00141 , fGrammarPoolMemoryManager(grammarResolver->getGrammarPoolMemoryManager()) 00142 , fGrammar(0) 00143 , fRootGrammar(0) 00144 , fURIStringPool(0) 00145 , fRootElemName(0) 00146 , fExternalSchemaLocation(0) 00147 , fExternalNoNamespaceSchemaLocation(0) 00148 , fSecurityManager(0) 00149 , fXMLVersion(XMLReader::XMLV1_0) 00150 , fMemoryManager(manager) 00151 , fBufMgr(manager) 00152 , fAttNameBuf(1023, manager) 00153 , fAttValueBuf(1023, manager) 00154 , fCDataBuf(1023, manager) 00155 , fQNameBuf(1023, manager) 00156 , fPrefixBuf(1023, manager) 00157 , fURIBuf(1023, manager) 00158 , fWSNormalizeBuf(1023, manager) 00159 , fElemStack(manager) 00160 { 00161 CleanupType cleanup(this, &XMLScanner::cleanUp); 00162 00163 try 00164 { 00165 commonInit(); 00166 } 00167 catch(const OutOfMemoryException&) 00168 { 00169 // Don't cleanup when out of memory, since executing the 00170 // code can cause problems. 00171 cleanup.release(); 00172 00173 throw; 00174 } 00175 00176 cleanup.release(); 00177 } 00178 00179 XMLScanner::XMLScanner( XMLDocumentHandler* const docHandler 00180 , DocTypeHandler* const docTypeHandler 00181 , XMLEntityHandler* const entityHandler 00182 , XMLErrorReporter* const errHandler 00183 , XMLValidator* const valToAdopt 00184 , GrammarResolver* const grammarResolver 00185 , MemoryManager* const manager) 00186 00187 : fBufferSize(1024 * 1024) 00188 , fLowWaterMark (100) 00189 , fStandardUriConformant(false) 00190 , fCalculateSrcOfs(false) 00191 , fDoNamespaces(false) 00192 , fExitOnFirstFatal(true) 00193 , fValidationConstraintFatal(false) 00194 , fInException(false) 00195 , fStandalone(false) 00196 , fHasNoDTD(true) 00197 , fValidate(false) 00198 , fValidatorFromUser(false) 00199 , fDoSchema(false) 00200 , fSchemaFullChecking(false) 00201 , fIdentityConstraintChecking(true) 00202 , fToCacheGrammar(false) 00203 , fUseCachedGrammar(false) 00204 , fLoadExternalDTD(true) 00205 , fLoadSchema(true) 00206 , fNormalizeData(true) 00207 , fGenerateSyntheticAnnotations(false) 00208 , fValidateAnnotations(false) 00209 , fIgnoreCachedDTD(false) 00210 , fIgnoreAnnotations(false) 00211 , fDisableDefaultEntityResolution(false) 00212 , fSkipDTDValidation(false) 00213 , fHandleMultipleImports(false) 00214 , fErrorCount(0) 00215 , fEntityExpansionLimit(0) 00216 , fEntityExpansionCount(0) 00217 , fEmptyNamespaceId(0) 00218 , fUnknownNamespaceId(0) 00219 , fXMLNamespaceId(0) 00220 , fXMLNSNamespaceId(0) 00221 , fSchemaNamespaceId(0) 00222 , fUIntPool(0) 00223 , fUIntPoolRow(0) 00224 , fUIntPoolCol(0) 00225 , fUIntPoolRowTotal(2) 00226 , fScannerId(0) 00227 , fSequenceId(0) 00228 , fAttrList(0) 00229 , fAttrDupChkRegistry(0) 00230 , fDocHandler(docHandler) 00231 , fDocTypeHandler(docTypeHandler) 00232 , fEntityHandler(entityHandler) 00233 , fErrorReporter(errHandler) 00234 , fErrorHandler(0) 00235 , fPSVIHandler(0) 00236 , fValidationContext(0) 00237 , fEntityDeclPoolRetrieved(false) 00238 , fReaderMgr(manager) 00239 , fValidator(valToAdopt) 00240 , fValScheme(Val_Never) 00241 , fGrammarResolver(grammarResolver) 00242 , fGrammarPoolMemoryManager(grammarResolver->getGrammarPoolMemoryManager()) 00243 , fGrammar(0) 00244 , fRootGrammar(0) 00245 , fURIStringPool(0) 00246 , fRootElemName(0) 00247 , fExternalSchemaLocation(0) 00248 , fExternalNoNamespaceSchemaLocation(0) 00249 , fSecurityManager(0) 00250 , fXMLVersion(XMLReader::XMLV1_0) 00251 , fMemoryManager(manager) 00252 , fBufMgr(manager) 00253 , fAttNameBuf(1023, manager) 00254 , fAttValueBuf(1023, manager) 00255 , fCDataBuf(1023, manager) 00256 , fQNameBuf(1023, manager) 00257 , fPrefixBuf(1023, manager) 00258 , fURIBuf(1023, manager) 00259 , fWSNormalizeBuf(1023, manager) 00260 , fElemStack(manager) 00261 { 00262 CleanupType cleanup(this, &XMLScanner::cleanUp); 00263 00264 try 00265 { 00266 commonInit(); 00267 } 00268 catch(const OutOfMemoryException&) 00269 { 00270 // Don't cleanup when out of memory, since executing the 00271 // code can cause problems. 00272 cleanup.release(); 00273 00274 throw; 00275 } 00276 00277 cleanup.release(); 00278 } 00279 00280 XMLScanner::~XMLScanner() 00281 { 00282 cleanUp(); 00283 } 00284 00285 void XMLScanner::resetCachedGrammar () 00286 { 00287 } 00288 00289 void XMLScanner::setValidator(XMLValidator* const valToAdopt) 00290 { 00291 if (fValidatorFromUser) 00292 delete fValidator; 00293 fValidator = valToAdopt; 00294 fValidatorFromUser = true; 00295 initValidator(fValidator); 00296 } 00297 00298 00299 00300 // --------------------------------------------------------------------------- 00301 // XMLScanner: Main entry point to scan a document 00302 // --------------------------------------------------------------------------- 00303 void XMLScanner::scanDocument( const XMLCh* const systemId) 00304 { 00305 // First we try to parse it as a URL. If that fails, we assume its 00306 // a file and try it that way. 00307 InputSource* srcToUse = 0; 00308 try 00309 { 00310 // Create a temporary URL. Since this is the primary document, 00311 // it has to be fully qualified. If not, then assume we are just 00312 // mistaking a file for a URL. 00313 XMLURL tmpURL(fMemoryManager); 00314 00315 if (XMLURL::parse(systemId, tmpURL)) { 00316 00317 if (tmpURL.isRelative()) { 00318 if (!fStandardUriConformant) 00319 srcToUse = new (fMemoryManager) LocalFileInputSource(systemId, fMemoryManager); 00320 else { 00321 // since this is the top of the try/catch, cannot call ThrowXMLwithMemMgr 00322 // emit the error directly 00323 MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_NoProtocolPresent, fMemoryManager); 00324 fInException = true; 00325 emitError 00326 ( 00327 XMLErrs::XMLException_Fatal 00328 , e.getCode() 00329 , e.getMessage() 00330 ); 00331 return; 00332 } 00333 } 00334 else 00335 { 00336 if (fStandardUriConformant && tmpURL.hasInvalidChar()) { 00337 MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_MalformedURL, fMemoryManager); 00338 fInException = true; 00339 emitError 00340 ( 00341 XMLErrs::XMLException_Fatal 00342 , e.getCode() 00343 , e.getMessage() 00344 ); 00345 return; 00346 } 00347 srcToUse = new (fMemoryManager) URLInputSource(tmpURL, fMemoryManager); 00348 } 00349 } 00350 else { 00351 00352 if (!fStandardUriConformant) 00353 srcToUse = new (fMemoryManager) LocalFileInputSource(systemId, fMemoryManager); 00354 else { 00355 // since this is the top of the try/catch, cannot call ThrowXMLwithMemMgr 00356 // emit the error directly 00357 // lazy bypass ... since all MalformedURLException are fatal, no need to check the type 00358 MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_MalformedURL, fMemoryManager); 00359 fInException = true; 00360 emitError 00361 ( 00362 XMLErrs::XMLException_Fatal 00363 , e.getCode() 00364 , e.getMessage() 00365 ); 00366 return; 00367 } 00368 } 00369 } 00370 catch(const XMLException& excToCatch) 00371 { 00372 // For any other XMLException, 00373 // emit the error and catch any user exception thrown from here. 00374 fInException = true; 00375 if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning) 00376 emitError 00377 ( 00378 XMLErrs::XMLException_Warning 00379 , excToCatch.getCode() 00380 , excToCatch.getMessage() 00381 ); 00382 else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal) 00383 emitError 00384 ( 00385 XMLErrs::XMLException_Fatal 00386 , excToCatch.getCode() 00387 , excToCatch.getMessage() 00388 ); 00389 else 00390 emitError 00391 ( 00392 XMLErrs::XMLException_Error 00393 , excToCatch.getCode() 00394 , excToCatch.getMessage() 00395 ); 00396 return; 00397 } 00398 00399 Janitor<InputSource> janSrc(srcToUse); 00400 scanDocument(*srcToUse); 00401 } 00402 00403 void XMLScanner::scanDocument( const char* const systemId) 00404 { 00405 // We just delegate this to the XMLCh version after transcoding 00406 XMLCh* tmpBuf = XMLString::transcode(systemId, fMemoryManager); 00407 ArrayJanitor<XMLCh> janBuf(tmpBuf, fMemoryManager); 00408 scanDocument(tmpBuf); 00409 } 00410 00411 00412 // This method begins a progressive parse. It scans through the prolog and 00413 // returns a token to be used on subsequent scanNext() calls. If the return 00414 // value is true, then the token is legal and ready for further use. If it 00415 // returns false, then the scan of the prolog failed and the token is not 00416 // going to work on subsequent scanNext() calls. 00417 bool XMLScanner::scanFirst( const XMLCh* const systemId 00418 , XMLPScanToken& toFill) 00419 { 00420 // First we try to parse it as a URL. If that fails, we assume its 00421 // a file and try it that way. 00422 InputSource* srcToUse = 0; 00423 try 00424 { 00425 // Create a temporary URL. Since this is the primary document, 00426 // it has to be fully qualified. If not, then assume we are just 00427 // mistaking a file for a URL. 00428 XMLURL tmpURL(fMemoryManager); 00429 if (XMLURL::parse(systemId, tmpURL)) { 00430 if (tmpURL.isRelative()) { 00431 if (!fStandardUriConformant) 00432 srcToUse = new (fMemoryManager) LocalFileInputSource(systemId, fMemoryManager); 00433 else { 00434 // since this is the top of the try/catch, cannot call ThrowXMLwithMemMgr 00435 // emit the error directly 00436 MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_NoProtocolPresent, fMemoryManager); 00437 fInException = true; 00438 emitError 00439 ( 00440 XMLErrs::XMLException_Fatal 00441 , e.getCode() 00442 , e.getMessage() 00443 ); 00444 return false; 00445 } 00446 } 00447 else 00448 { 00449 if (fStandardUriConformant && tmpURL.hasInvalidChar()) { 00450 MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_MalformedURL, fMemoryManager); 00451 fInException = true; 00452 emitError 00453 ( 00454 XMLErrs::XMLException_Fatal 00455 , e.getCode() 00456 , e.getMessage() 00457 ); 00458 return false; 00459 } 00460 srcToUse = new (fMemoryManager) URLInputSource(tmpURL, fMemoryManager); 00461 } 00462 } 00463 else { 00464 if (!fStandardUriConformant) 00465 srcToUse = new (fMemoryManager) LocalFileInputSource(systemId, fMemoryManager); 00466 else { 00467 // since this is the top of the try/catch, cannot call ThrowXMLwithMemMgr 00468 // emit the error directly 00469 // lazy bypass ... since all MalformedURLException are fatal, no need to check the type 00470 MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_MalformedURL); 00471 fInException = true; 00472 emitError 00473 ( 00474 XMLErrs::XMLException_Fatal 00475 , e.getCode() 00476 , e.getMessage() 00477 ); 00478 return false; 00479 } 00480 } 00481 } 00482 catch(const XMLException& excToCatch) 00483 { 00484 // For any other XMLException, 00485 // emit the error and catch any user exception thrown from here. 00486 fInException = true; 00487 if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning) 00488 emitError 00489 ( 00490 XMLErrs::XMLException_Warning 00491 , excToCatch.getCode() 00492 , excToCatch.getMessage() 00493 ); 00494 else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal) 00495 emitError 00496 ( 00497 XMLErrs::XMLException_Fatal 00498 , excToCatch.getCode() 00499 , excToCatch.getMessage() 00500 ); 00501 else 00502 emitError 00503 ( 00504 XMLErrs::XMLException_Error 00505 , excToCatch.getCode() 00506 , excToCatch.getMessage() 00507 ); 00508 return false; 00509 } 00510 00511 Janitor<InputSource> janSrc(srcToUse); 00512 return scanFirst(*srcToUse, toFill); 00513 } 00514 00515 bool XMLScanner::scanFirst( const char* const systemId 00516 , XMLPScanToken& toFill) 00517 { 00518 // We just delegate this to the XMLCh version after transcoding 00519 XMLCh* tmpBuf = XMLString::transcode(systemId, fMemoryManager); 00520 ArrayJanitor<XMLCh> janBuf(tmpBuf, fMemoryManager); 00521 return scanFirst(tmpBuf, toFill); 00522 } 00523 00524 bool XMLScanner::scanFirst( const InputSource& src 00525 , XMLPScanToken& toFill) 00526 { 00527 // Bump up the sequence id for this new scan cycle. This will invalidate 00528 // any previous tokens we've returned. 00529 fSequenceId++; 00530 00531 ReaderMgrResetType resetReaderMgr(&fReaderMgr, &ReaderMgr::reset); 00532 00533 // Reset the scanner and its plugged in stuff for a new run. This 00534 // resets all the data structures, creates the initial reader and 00535 // pushes it on the stack, and sets up the base document path 00536 scanReset(src); 00537 00538 // If we have a document handler, then call the start document 00539 if (fDocHandler) 00540 fDocHandler->startDocument(); 00541 00542 try 00543 { 00544 // Scan the prolog part, which is everything before the root element 00545 // including the DTD subsets. This is all that is done on the scan 00546 // first. 00547 scanProlog(); 00548 00549 // If we got to the end of input, then its not a valid XML file. 00550 // Else, go on to scan the content. 00551 if (fReaderMgr.atEOF()) 00552 { 00553 emitError(XMLErrs::EmptyMainEntity); 00554 } 00555 } 00556 // NOTE: 00557 // 00558 // In all of the error processing below, the emitError() call MUST come 00559 // before the flush of the reader mgr, or it will fail because it tries 00560 // to find out the position in the XML source of the error. 00561 catch(const XMLErrs::Codes) 00562 { 00563 // This is a 'first failure' exception so return failure 00564 return false; 00565 } 00566 catch(const XMLValid::Codes) 00567 { 00568 // This is a 'first fatal error' type exit, return failure 00569 return false; 00570 } 00571 catch(const XMLException& excToCatch) 00572 { 00573 // Emit the error and catch any user exception thrown from here. Make 00574 // sure in all cases we flush the reader manager. 00575 fInException = true; 00576 try 00577 { 00578 if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning) 00579 emitError 00580 ( 00581 XMLErrs::XMLException_Warning 00582 , excToCatch.getCode() 00583 , excToCatch.getMessage() 00584 ); 00585 else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal) 00586 emitError 00587 ( 00588 XMLErrs::XMLException_Fatal 00589 , excToCatch.getCode() 00590 , excToCatch.getMessage() 00591 ); 00592 else 00593 emitError 00594 ( 00595 XMLErrs::XMLException_Error 00596 , excToCatch.getCode() 00597 , excToCatch.getMessage() 00598 ); 00599 } 00600 catch(const OutOfMemoryException&) 00601 { 00602 // This is a special case for out-of-memory 00603 // conditions, because resetting the ReaderMgr 00604 // can be problematic. 00605 resetReaderMgr.release(); 00606 00607 throw; 00608 } 00609 00610 return false; 00611 } 00612 catch(const OutOfMemoryException&) 00613 { 00614 // This is a special case for out-of-memory 00615 // conditions, because resetting the ReaderMgr 00616 // can be problematic. 00617 resetReaderMgr.release(); 00618 00619 throw; 00620 } 00621 00622 // Fill in the caller's token to make it legal and return success 00623 toFill.set(fScannerId, fSequenceId); 00624 00625 // Release the object that will reset the ReaderMgr, since there's 00626 // more to scan. 00627 resetReaderMgr.release(); 00628 00629 return true; 00630 } 00631 00632 00633 void XMLScanner::scanReset(XMLPScanToken& token) 00634 { 00635 // Make sure this token is still legal 00636 if (!isLegalToken(token)) 00637 ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Scan_BadPScanToken, fMemoryManager); 00638 00639 // Reset the reader manager 00640 fReaderMgr.reset(); 00641 00642 // And invalidate any tokens by bumping our sequence number 00643 fSequenceId++; 00644 00645 // Reset our error count 00646 fErrorCount = 0; 00647 } 00648 00649 void XMLScanner::setParseSettings(XMLScanner* const refScanner) 00650 { 00651 setDocHandler(refScanner->getDocHandler()); 00652 setDocTypeHandler(refScanner->getDocTypeHandler()); 00653 setErrorHandler(refScanner->getErrorHandler()); 00654 setErrorReporter(refScanner->getErrorReporter()); 00655 setEntityHandler(refScanner->getEntityHandler()); 00656 setDoNamespaces(refScanner->getDoNamespaces()); 00657 setDoSchema(refScanner->getDoSchema()); 00658 setCalculateSrcOfs(refScanner->getCalculateSrcOfs()); 00659 setStandardUriConformant(refScanner->getStandardUriConformant()); 00660 setExitOnFirstFatal(refScanner->getExitOnFirstFatal()); 00661 setValidationConstraintFatal(refScanner->getValidationConstraintFatal()); 00662 setIdentityConstraintChecking(refScanner->getIdentityConstraintChecking()); 00663 setValidationSchemaFullChecking(refScanner->getValidationSchemaFullChecking()); 00664 cacheGrammarFromParse(refScanner->isCachingGrammarFromParse()); 00665 useCachedGrammarInParse(refScanner->isUsingCachedGrammarInParse()); 00666 setLoadExternalDTD(refScanner->getLoadExternalDTD()); 00667 setLoadSchema(refScanner->getLoadSchema()); 00668 setNormalizeData(refScanner->getNormalizeData()); 00669 setExternalSchemaLocation(refScanner->getExternalSchemaLocation()); 00670 setExternalNoNamespaceSchemaLocation(refScanner->getExternalNoNamespaceSchemaLocation()); 00671 setValidationScheme(refScanner->getValidationScheme()); 00672 setSecurityManager(refScanner->getSecurityManager()); 00673 setPSVIHandler(refScanner->getPSVIHandler()); 00674 } 00675 00676 // --------------------------------------------------------------------------- 00677 // XMLScanner: Private helper methods. 00678 // --------------------------------------------------------------------------- 00679 00680 // This method handles the common initialization, to avoid having to do 00681 // it redundantly in multiple constructors. 00682 void XMLScanner::commonInit() 00683 { 00684 // We have to do a little init that involves statics, so we have to 00685 // use the mutex to protect it. 00686 { 00687 XMLMutexLock lockInit(sScannerMutex); 00688 00689 // And assign ourselves the next available scanner id 00690 fScannerId = ++gScannerId; 00691 } 00692 00693 // Create the attribute list, which is used to store attribute values 00694 // during start tag processing. Give it a reasonable initial size that 00695 // will serve for most folks, though it will grow as required. 00696 fAttrList = new (fMemoryManager) RefVectorOf<XMLAttr>(32, true, fMemoryManager); 00697 00698 // Create the id ref list. This is used to enforce XML 1.0 ID ref 00699 // semantics, i.e. all id refs must refer to elements that exist 00700 fValidationContext = new (fMemoryManager) ValidationContextImpl(fMemoryManager); 00701 fValidationContext->setElemStack(&fElemStack); 00702 fValidationContext->setScanner(this); 00703 00704 // Create the GrammarResolver 00705 //fGrammarResolver = new GrammarResolver(); 00706 00707 // create initial, 64-element, fUIntPool 00708 fUIntPool = (unsigned int **)fMemoryManager->allocate(sizeof(unsigned int *) *fUIntPoolRowTotal); 00709 memset(fUIntPool, 0, sizeof(unsigned int *) * fUIntPoolRowTotal); 00710 fUIntPool[0] = (unsigned int *)fMemoryManager->allocate(sizeof(unsigned int) << 6); 00711 memset(fUIntPool[0], 0, sizeof(unsigned int) << 6); 00712 00713 // Register self as handler for XMLBufferFull events on the CDATA buffer 00714 fCDataBuf.setFullHandler(this, fBufferSize); 00715 00716 if (fValidator) { 00717 fValidatorFromUser = true; 00718 initValidator(fValidator); 00719 } 00720 } 00721 00722 void XMLScanner::cleanUp() 00723 { 00724 delete fAttrList; 00725 delete fAttrDupChkRegistry; 00726 delete fValidationContext; 00727 fMemoryManager->deallocate(fRootElemName);//delete [] fRootElemName; 00728 fMemoryManager->deallocate(fExternalSchemaLocation);//delete [] fExternalSchemaLocation; 00729 fMemoryManager->deallocate(fExternalNoNamespaceSchemaLocation);//delete [] fExternalNoNamespaceSchemaLocation; 00730 // delete fUIntPool 00731 if (fUIntPool) 00732 { 00733 for (unsigned int i=0; i<=fUIntPoolRow; i++) 00734 { 00735 fMemoryManager->deallocate(fUIntPool[i]); 00736 } 00737 fMemoryManager->deallocate(fUIntPool); 00738 } 00739 } 00740 00741 void XMLScanner::initValidator(XMLValidator* theValidator) { 00742 00743 // Tell the validator about the stuff it needs to know in order to 00744 // do its work. 00745 theValidator->setScannerInfo(this, &fReaderMgr, &fBufMgr); 00746 theValidator->setErrorReporter(fErrorReporter); 00747 } 00748 00749 // --------------------------------------------------------------------------- 00750 // XMLScanner: Error emitting methods 00751 // --------------------------------------------------------------------------- 00752 00753 // These methods are called whenever the scanner wants to emit an error. 00754 // It handles getting the message loaded, doing token replacement, etc... 00755 // and then calling the error handler, if its installed. 00756 bool XMLScanner::emitErrorWillThrowException(const XMLErrs::Codes toEmit) 00757 { 00758 if (XMLErrs::isFatal(toEmit) && fExitOnFirstFatal && !fInException) 00759 return true; 00760 return false; 00761 } 00762 00763 void XMLScanner::emitError(const XMLErrs::Codes toEmit) 00764 { 00765 // Bump the error count if it is not a warning 00766 if (XMLErrs::errorType(toEmit) != XMLErrorReporter::ErrType_Warning) 00767 incrementErrorCount(); 00768 00769 if (fErrorReporter) 00770 { 00771 // Load the message into a local for display 00772 const XMLSize_t msgSize = 1023; 00773 XMLCh errText[msgSize + 1]; 00774 00775 if (!gMsgLoader->loadMsg(toEmit, errText, msgSize)) 00776 { 00777 // <TBD> Probably should load a default msg here 00778 } 00779 00780 // Create a LastExtEntityInfo structure and get the reader manager 00781 // to fill it in for us. This will give us the information about 00782 // the last reader on the stack that was an external entity of some 00783 // sort (i.e. it will ignore internal entities. 00784 ReaderMgr::LastExtEntityInfo lastInfo; 00785 fReaderMgr.getLastExtEntityInfo(lastInfo); 00786 00787 fErrorReporter->error 00788 ( 00789 toEmit 00790 , XMLUni::fgXMLErrDomain 00791 , XMLErrs::errorType(toEmit) 00792 , errText 00793 , lastInfo.systemId 00794 , lastInfo.publicId 00795 , lastInfo.lineNumber 00796 , lastInfo.colNumber 00797 ); 00798 } 00799 00800 // Bail out if its fatal an we are to give up on the first fatal error 00801 if (emitErrorWillThrowException(toEmit)) 00802 throw toEmit; 00803 } 00804 00805 void XMLScanner::emitError( const XMLErrs::Codes toEmit 00806 , const XMLCh* const text1 00807 , const XMLCh* const text2 00808 , const XMLCh* const text3 00809 , const XMLCh* const text4) 00810 { 00811 // Bump the error count if it is not a warning 00812 if (XMLErrs::errorType(toEmit) != XMLErrorReporter::ErrType_Warning) 00813 incrementErrorCount(); 00814 00815 if (fErrorReporter) 00816 { 00817 // Load the message into alocal and replace any tokens found in 00818 // the text. 00819 const XMLSize_t maxChars = 2047; 00820 XMLCh errText[maxChars + 1]; 00821 00822 if (!gMsgLoader->loadMsg(toEmit, errText, maxChars, text1, text2, text3, text4, fMemoryManager)) 00823 { 00824 // <TBD> Should probably load a default message here 00825 } 00826 00827 // Create a LastExtEntityInfo structure and get the reader manager 00828 // to fill it in for us. This will give us the information about 00829 // the last reader on the stack that was an external entity of some 00830 // sort (i.e. it will ignore internal entities. 00831 ReaderMgr::LastExtEntityInfo lastInfo; 00832 fReaderMgr.getLastExtEntityInfo(lastInfo); 00833 00834 fErrorReporter->error 00835 ( 00836 toEmit 00837 , XMLUni::fgXMLErrDomain 00838 , XMLErrs::errorType(toEmit) 00839 , errText 00840 , lastInfo.systemId 00841 , lastInfo.publicId 00842 , lastInfo.lineNumber 00843 , lastInfo.colNumber 00844 ); 00845 } 00846 00847 // Bail out if its fatal an we are to give up on the first fatal error 00848 if (emitErrorWillThrowException(toEmit)) 00849 throw toEmit; 00850 } 00851 00852 void XMLScanner::emitError( const XMLErrs::Codes toEmit 00853 , const char* const text1 00854 , const char* const text2 00855 , const char* const text3 00856 , const char* const text4) 00857 { 00858 // Bump the error count if it is not a warning 00859 if (XMLErrs::errorType(toEmit) != XMLErrorReporter::ErrType_Warning) 00860 incrementErrorCount(); 00861 00862 if (fErrorReporter) 00863 { 00864 // Load the message into alocal and replace any tokens found in 00865 // the text. 00866 const XMLSize_t maxChars = 2047; 00867 XMLCh errText[maxChars + 1]; 00868 00869 if (!gMsgLoader->loadMsg(toEmit, errText, maxChars, text1, text2, text3, text4, fMemoryManager)) 00870 { 00871 // <TBD> Should probably load a default message here 00872 } 00873 00874 // Create a LastExtEntityInfo structure and get the reader manager 00875 // to fill it in for us. This will give us the information about 00876 // the last reader on the stack that was an external entity of some 00877 // sort (i.e. it will ignore internal entities. 00878 ReaderMgr::LastExtEntityInfo lastInfo; 00879 fReaderMgr.getLastExtEntityInfo(lastInfo); 00880 00881 fErrorReporter->error 00882 ( 00883 toEmit 00884 , XMLUni::fgXMLErrDomain 00885 , XMLErrs::errorType(toEmit) 00886 , errText 00887 , lastInfo.systemId 00888 , lastInfo.publicId 00889 , lastInfo.lineNumber 00890 , lastInfo.colNumber 00891 ); 00892 } 00893 00894 // Bail out if its fatal an we are to give up on the first fatal error 00895 if (emitErrorWillThrowException(toEmit)) 00896 throw toEmit; 00897 } 00898 00899 void XMLScanner::emitError( const XMLErrs::Codes toEmit 00900 , const XMLExcepts::Codes originalExceptCode 00901 , const XMLCh* const text1 00902 , const XMLCh* const text2 00903 , const XMLCh* const text3 00904 , const XMLCh* const text4) 00905 { 00906 // Bump the error count if it is not a warning 00907 if (XMLErrs::errorType(toEmit) != XMLErrorReporter::ErrType_Warning) 00908 incrementErrorCount(); 00909 00910 if (fErrorReporter) 00911 { 00912 // Load the message into alocal and replace any tokens found in 00913 // the text. 00914 const XMLSize_t maxChars = 2047; 00915 XMLCh errText[maxChars + 1]; 00916 00917 if (!gMsgLoader->loadMsg(toEmit, errText, maxChars, text1, text2, text3, text4, fMemoryManager)) 00918 { 00919 // <TBD> Should probably load a default message here 00920 } 00921 00922 // Create a LastExtEntityInfo structure and get the reader manager 00923 // to fill it in for us. This will give us the information about 00924 // the last reader on the stack that was an external entity of some 00925 // sort (i.e. it will ignore internal entities. 00926 ReaderMgr::LastExtEntityInfo lastInfo; 00927 fReaderMgr.getLastExtEntityInfo(lastInfo); 00928 00929 fErrorReporter->error 00930 ( 00931 originalExceptCode 00932 , XMLUni::fgExceptDomain //fgXMLErrDomain 00933 , XMLErrs::errorType(toEmit) 00934 , errText 00935 , lastInfo.systemId 00936 , lastInfo.publicId 00937 , lastInfo.lineNumber 00938 , lastInfo.colNumber 00939 ); 00940 } 00941 00942 // Bail out if its fatal an we are to give up on the first fatal error 00943 if (emitErrorWillThrowException(toEmit)) 00944 throw toEmit; 00945 } 00946 00947 // --------------------------------------------------------------------------- 00948 // XMLScanner: Getter methods 00949 // --------------------------------------------------------------------------- 00950 00951 // This method allows the caller to query the current location of the scanner. 00952 // It will return the sys/public ids of the current entity, and the line/col 00953 // position within it. 00954 // 00955 // NOTE: This API returns the location with the last external file. So if its 00956 // currently scanning an entity, the position returned will be the end of 00957 // the entity reference in the file that had the reference. 00958 // 00959 /*bool 00960 XMLScanner::getLastExtLocation( XMLCh* const sysIdToFill 00961 , const unsigned int maxSysIdChars 00962 , XMLCh* const pubIdToFill 00963 , const unsigned int maxPubIdChars 00964 , XMLSSize_t& lineToFill 00965 , XMLSSize_t& colToFill) const 00966 { 00967 // Create a local info object and get it filled in by the reader manager 00968 ReaderMgr::LastExtEntityInfo lastInfo; 00969 fReaderMgr.getLastExtEntityInfo(lastInfo); 00970 00971 // Fill in the line and column number 00972 lineToFill = lastInfo.lineNumber; 00973 colToFill = lastInfo.colNumber; 00974 00975 // And copy over as much of the ids as will fit 00976 sysIdToFill[0] = 0; 00977 if (lastInfo.systemId) 00978 { 00979 if (XMLString::stringLen(lastInfo.systemId) > maxSysIdChars) 00980 return false; 00981 XMLString::copyString(sysIdToFill, lastInfo.systemId); 00982 } 00983 00984 pubIdToFill[0] = 0; 00985 if (lastInfo.publicId) 00986 { 00987 if (XMLString::stringLen(lastInfo.publicId) > maxPubIdChars) 00988 return false; 00989 XMLString::copyString(pubIdToFill, lastInfo.publicId); 00990 } 00991 return true; 00992 }*/ 00993 00994 00995 // --------------------------------------------------------------------------- 00996 // XMLScanner: Private scanning methods 00997 // --------------------------------------------------------------------------- 00998 00999 // This method is called after the end of the root element, to handle 01000 // any miscellaneous stuff hanging around. 01001 void XMLScanner::scanMiscellaneous() 01002 { 01003 // Get a buffer for this work 01004 XMLBufBid bbCData(&fBufMgr); 01005 01006 while (true) 01007 { 01008 try 01009 { 01010 const XMLCh nextCh = fReaderMgr.peekNextChar(); 01011 01012 // Watch for end of file and break out 01013 if (!nextCh) 01014 break; 01015 01016 if (nextCh == chOpenAngle) 01017 { 01018 if (checkXMLDecl(true)) 01019 { 01020 // Can't have an XML decl here 01021 emitError(XMLErrs::NotValidAfterContent); 01022 fReaderMgr.skipPastChar(chCloseAngle); 01023 } 01024 else if (fReaderMgr.skippedString(XMLUni::fgPIString)) 01025 { 01026 scanPI(); 01027 } 01028 else if (fReaderMgr.skippedString(XMLUni::fgCommentString)) 01029 { 01030 scanComment(); 01031 } 01032 else 01033 { 01034 // This can't be possible, so just give up 01035 emitError(XMLErrs::ExpectedCommentOrPI); 01036 fReaderMgr.skipPastChar(chCloseAngle); 01037 } 01038 } 01039 else if (fReaderMgr.getCurrentReader()->isWhitespace(nextCh)) 01040 { 01041 // If we have a doc handler, then gather up the spaces and 01042 // call back. Otherwise, just skip over whitespace. 01043 if (fDocHandler) 01044 { 01045 fReaderMgr.getSpaces(bbCData.getBuffer()); 01046 fDocHandler->ignorableWhitespace 01047 ( 01048 bbCData.getRawBuffer() 01049 , bbCData.getLen() 01050 , false 01051 ); 01052 } 01053 else 01054 { 01055 fReaderMgr.skipPastSpaces(); 01056 } 01057 } 01058 else 01059 { 01060 emitError(XMLErrs::ExpectedCommentOrPI); 01061 fReaderMgr.skipPastChar(chCloseAngle); 01062 } 01063 } 01064 catch(const EndOfEntityException&) 01065 { 01066 // Some entity leaked out of the content part of the document. Issue 01067 // a warning and keep going. 01068 emitError(XMLErrs::EntityPropogated); 01069 } 01070 } 01071 } 01072 01073 01074 // Scans a PI and calls the appropriate callbacks. At entry we have just 01075 // scanned the <? part, and need to now start on the PI target name. 01076 void XMLScanner::scanPI() 01077 { 01078 const XMLCh* namePtr = 0; 01079 const XMLCh* targetPtr = 0; 01080 01081 // If there are any spaces here, then warn about it. If we aren't in 01082 // 'first error' mode, then we'll come back and can easily pick up 01083 // again by just skipping them. 01084 if (fReaderMgr.lookingAtSpace()) 01085 { 01086 emitError(XMLErrs::PINameExpected); 01087 fReaderMgr.skipPastSpaces(); 01088 } 01089 01090 // Get a buffer for the PI name and scan it in 01091 XMLBufBid bbName(&fBufMgr); 01092 if (!fReaderMgr.getName(bbName.getBuffer())) 01093 { 01094 emitError(XMLErrs::PINameExpected); 01095 fReaderMgr.skipPastChar(chCloseAngle); 01096 return; 01097 } 01098 01099 // Point the name pointer at the raw data 01100 namePtr = bbName.getRawBuffer(); 01101 01102 // See if it is some form of 'xml' and emit a warning 01103 //if (!XMLString::compareIString(namePtr, XMLUni::fgXMLString)) 01104 if (bbName.getLen() == 3 && 01105 (((namePtr[0] == chLatin_x) || (namePtr[0] == chLatin_X)) && 01106 ((namePtr[1] == chLatin_m) || (namePtr[1] == chLatin_M)) && 01107 ((namePtr[2] == chLatin_l) || (namePtr[2] == chLatin_L)))) 01108 emitError(XMLErrs::NoPIStartsWithXML); 01109 01110 // If namespaces are enabled, then no colons allowed 01111 if (fDoNamespaces) 01112 { 01113 if (XMLString::indexOf(namePtr, chColon) != -1) 01114 emitError(XMLErrs::ColonNotLegalWithNS); 01115 } 01116 01117 // If we don't hit a space next, then the PI has no target. If we do 01118 // then get out the target. Get a buffer for it as well 01119 XMLBufBid bbTarget(&fBufMgr); 01120 if (fReaderMgr.skippedSpace()) 01121 { 01122 // Skip any leading spaces 01123 fReaderMgr.skipPastSpaces(); 01124 01125 bool gotLeadingSurrogate = false; 01126 01127 // It does have a target, so lets move on to deal with that. 01128 while (1) 01129 { 01130 const XMLCh nextCh = fReaderMgr.getNextChar(); 01131 01132 // Watch for an end of file, which is always bad here 01133 if (!nextCh) 01134 { 01135 emitError(XMLErrs::UnterminatedPI); 01136 ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager); 01137 } 01138 01139 // Watch for potential terminating character 01140 if (nextCh == chQuestion) 01141 { 01142 // It must be followed by '>' to be a termination of the target 01143 if (fReaderMgr.skippedChar(chCloseAngle)) 01144 break; 01145 } 01146 01147 // Check for correct surrogate pairs 01148 if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF)) 01149 { 01150 if (gotLeadingSurrogate) 01151 emitError(XMLErrs::Expected2ndSurrogateChar); 01152 else 01153 gotLeadingSurrogate = true; 01154 } 01155 else 01156 { 01157 if (gotLeadingSurrogate) 01158 { 01159 if ((nextCh < 0xDC00) || (nextCh > 0xDFFF)) 01160 emitError(XMLErrs::Expected2ndSurrogateChar); 01161 } 01162 // Its got to at least be a valid XML character 01163 else if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh)) { 01164 01165 XMLCh tmpBuf[9]; 01166 XMLString::binToText 01167 ( 01168 nextCh 01169 , tmpBuf 01170 , 8 01171 , 16 01172 , fMemoryManager 01173 ); 01174 emitError(XMLErrs::InvalidCharacter, tmpBuf); 01175 } 01176 01177 gotLeadingSurrogate = false; 01178 } 01179 01180 bbTarget.append(nextCh); 01181 } 01182 } 01183 else 01184 { 01185 // No target, but make sure its terminated ok 01186 if (!fReaderMgr.skippedChar(chQuestion)) 01187 { 01188 emitError(XMLErrs::UnterminatedPI); 01189 fReaderMgr.skipPastChar(chCloseAngle); 01190 return; 01191 } 01192 01193 if (!fReaderMgr.skippedChar(chCloseAngle)) 01194 { 01195 emitError(XMLErrs::UnterminatedPI); 01196 fReaderMgr.skipPastChar(chCloseAngle); 01197 return; 01198 } 01199 } 01200 01201 // Point the target pointer at the raw data 01202 targetPtr = bbTarget.getRawBuffer(); 01203 01204 // If we have a handler, then call it 01205 if (fDocHandler) 01206 { 01207 fDocHandler->docPI 01208 ( 01209 namePtr 01210 , targetPtr 01211 ); 01212 } 01213 01214 //mark PI is seen within the current element 01215 if (! fElemStack.isEmpty()) 01216 fElemStack.setCommentOrPISeen(); 01217 01218 } 01219 01220 // Scans all the input from the start of the file to the root element. 01221 // There does not have to be anything in the prolog necessarily, but usually 01222 // there is at least an XMLDecl. 01223 // 01224 // On exit from here we are either at the end of the file or about to read 01225 // the opening < of the root element. 01226 void XMLScanner::scanProlog() 01227 { 01228 bool sawDocTypeDecl = false; 01229 // Get a buffer for whitespace processing 01230 XMLBufBid bbCData(&fBufMgr); 01231 01232 // Loop through the prolog. If there is no content, this could go all 01233 // the way to the end of the file. 01234 try 01235 { 01236 while (true) 01237 { 01238 const XMLCh nextCh = fReaderMgr.peekNextChar(); 01239 01240 if (nextCh == chOpenAngle) 01241 { 01242 // Ok, it could be the xml decl, a comment, the doc type line, 01243 // or the start of the root element. 01244 if (checkXMLDecl(true)) 01245 { 01246 // There shall be at lease --ONE-- space in between 01247 // the tag '<?xml' and the VersionInfo. 01248 // 01249 // If we are not at line 1, col 6, then the decl was not 01250 // the first text, so its invalid. 01251 const XMLReader* curReader = fReaderMgr.getCurrentReader(); 01252 if ((curReader->getLineNumber() != 1) 01253 || (curReader->getColumnNumber() != 7)) 01254 { 01255 emitError(XMLErrs::XMLDeclMustBeFirst); 01256 } 01257 01258 scanXMLDecl(Decl_XML); 01259 } 01260 else if (fReaderMgr.skippedString(XMLUni::fgPIString)) 01261 { 01262 scanPI(); 01263 } 01264 else if (fReaderMgr.skippedString(XMLUni::fgCommentString)) 01265 { 01266 scanComment(); 01267 } 01268 else if (fReaderMgr.skippedString(XMLUni::fgDocTypeString)) 01269 { 01270 if (sawDocTypeDecl) { 01271 emitError(XMLErrs::DuplicateDocTypeDecl); 01272 } 01273 scanDocTypeDecl(); 01274 sawDocTypeDecl = true; 01275 01276 // if reusing grammar, this has been validated already in first scan 01277 // skip for performance 01278 if (fValidate && fGrammar && !fGrammar->getValidated()) { 01279 // validate the DTD scan so far 01280 fValidator->preContentValidation(fUseCachedGrammar, true); 01281 } 01282 } 01283 else 01284 { 01285 // Assume its the start of the root element 01286 return; 01287 } 01288 } 01289 else if (fReaderMgr.getCurrentReader()->isWhitespace(nextCh)) 01290 { 01291 // If we have a document handler then gather up the 01292 // whitespace and call back. Otherwise just skip over spaces. 01293 if (fDocHandler) 01294 { 01295 fReaderMgr.getSpaces(bbCData.getBuffer()); 01296 fDocHandler->ignorableWhitespace 01297 ( 01298 bbCData.getRawBuffer() 01299 , bbCData.getLen() 01300 , false 01301 ); 01302 } 01303 else 01304 { 01305 fReaderMgr.skipPastSpaces(); 01306 } 01307 } 01308 else 01309 { 01310 emitError(XMLErrs::InvalidDocumentStructure); 01311 01312 // Watch for end of file and break out 01313 if (!nextCh) 01314 break; 01315 else 01316 fReaderMgr.skipPastChar(chCloseAngle); 01317 } 01318 01319 } 01320 } 01321 catch(const EndOfEntityException&) 01322 { 01323 // We should never get an end of entity here. They should only 01324 // occur within the doc type scanning method, and not leak out to 01325 // here. 01326 emitError 01327 ( 01328 XMLErrs::UnexpectedEOE 01329 , "in prolog" 01330 ); 01331 } 01332 } 01333 01334 01335 // Scans the <?xml .... ?> line. This stuff is all sequential so we don't 01336 // do any state machine loop here. We just bull straight through it. It ends 01337 // past the closing bracket. If there is a document handler, then its called 01338 // on the XMLDecl callback. 01339 // 01340 // On entry, the <?xml has been scanned, and we pick it up from there. 01341 // 01342 // NOTE: In order to provide good recovery from bad XML here, we try to be 01343 // very flexible. No matter what order the stuff is in, we'll keep going 01344 // though we'll issue errors. 01345 // 01346 // The parameter tells us which type of decl we should expect, Text or XML. 01347 // [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>' 01348 // [77] TextDecl::= '<?xml' VersionInfo? EncodingDecl S? '?>' 01349 void XMLScanner::scanXMLDecl(const DeclTypes type) 01350 { 01351 // Get us some buffers to use 01352 XMLBufBid bbVersion(&fBufMgr); 01353 XMLBufBid bbEncoding(&fBufMgr); 01354 XMLBufBid bbStand(&fBufMgr); 01355 XMLBufBid bbDummy(&fBufMgr); 01356 XMLBufBid bbName(&fBufMgr); 01357 01358 // We use this little enum and array to keep up with what we found 01359 // and what order we found them in. This lets us get them free form 01360 // without too much overhead, but still know that they were in the 01361 // wrong order. 01362 enum Strings 01363 { 01364 VersionString 01365 , EncodingString 01366 , StandaloneString 01367 , UnknownString 01368 01369 , StringCount 01370 }; 01371 int flags[StringCount] = { -1, -1, -1, -1 }; 01372 01373 // Also set up a list of buffers in the right order so that we know 01374 // where to put stuff. 01375 XMLBuffer* buffers[StringCount] ; 01376 buffers[0] = &bbVersion.getBuffer(); 01377 buffers[1] = &bbEncoding.getBuffer(); 01378 buffers[2] = &bbStand.getBuffer(); 01379 buffers[3] = &bbDummy.getBuffer(); 01380 01381 int curCount = 0; 01382 Strings curString; 01383 XMLBuffer& nameBuf = bbName.getBuffer(); 01384 while (true) 01385 { 01386 // Skip any spaces 01387 bool skippedSomething; 01388 fReaderMgr.skipPastSpaces(skippedSomething, true); 01389 01390 // If we are looking at a question mark, then break out 01391 if (fReaderMgr.lookingAtChar(chQuestion)) 01392 break; 01393 01394 // If this is not the first string, then we require the spaces 01395 if (!skippedSomething && curCount) 01396 emitError(XMLErrs::ExpectedWhitespace); 01397 01398 // Get characters up to the next whitespace or equal's sign. 01399 if (!scanUpToWSOr(nameBuf, chEqual)) 01400 emitError(XMLErrs::ExpectedDeclString); 01401 01402 // See if it matches any of our expected strings 01403 if (XMLString::equals(nameBuf.getRawBuffer(), XMLUni::fgVersionString)) 01404 curString = VersionString; 01405 else if (XMLString::equals(nameBuf.getRawBuffer(), XMLUni::fgEncodingString)) 01406 curString = EncodingString; 01407 else if (XMLString::equals(nameBuf.getRawBuffer(), XMLUni::fgStandaloneString)) 01408 curString = StandaloneString; 01409 else 01410 curString = UnknownString; 01411 01412 // If its an unknown string, then give that error. Else check to 01413 // see if this one has been done already and give that error. 01414 if (curString == UnknownString) 01415 emitError(XMLErrs::ExpectedDeclString, nameBuf.getRawBuffer()); 01416 else if (flags[curString] != -1) 01417 emitError(XMLErrs::DeclStringRep, nameBuf.getRawBuffer()); 01418 else if (flags[curString] == -1) 01419 flags[curString] = ++curCount; 01420 01421 // Scan for an equal's sign. If we don't find it, issue an error 01422 // but keep trying to go on. 01423 if (!scanEq(true)) 01424 emitError(XMLErrs::ExpectedEqSign); 01425 01426 // Get a quote string into the buffer for the string that we are 01427 // currently working on. 01428 if (!getQuotedString(*buffers[curString])) 01429 { 01430 emitError(XMLErrs::ExpectedQuotedString); 01431 fReaderMgr.skipPastChar(chCloseAngle); 01432 return; 01433 } 01434 01435 // And validate the value according which one it was 01436 const XMLCh* rawValue = buffers[curString]->getRawBuffer(); 01437 if (curString == VersionString) 01438 { 01439 if (XMLString::equals(rawValue, XMLUni::fgVersion1_1)) { 01440 if (type == Decl_XML) { 01441 fXMLVersion = XMLReader::XMLV1_1; 01442 fReaderMgr.setXMLVersion(XMLReader::XMLV1_1); 01443 } 01444 else { 01445 if (fXMLVersion != XMLReader::XMLV1_1) 01446 emitError(XMLErrs::UnsupportedXMLVersion, rawValue); 01447 } 01448 } 01449 else if (XMLString::equals(rawValue, XMLUni::fgVersion1_0)) { 01450 if (type == Decl_XML) { 01451 fXMLVersion = XMLReader::XMLV1_0; 01452 fReaderMgr.setXMLVersion(XMLReader::XMLV1_0); 01453 } 01454 } 01455 else 01456 emitError(XMLErrs::UnsupportedXMLVersion, rawValue); 01457 } 01458 else if (curString == EncodingString) 01459 { 01460 if (!XMLString::isValidEncName(rawValue)) 01461 emitError(XMLErrs::BadXMLEncoding, rawValue); 01462 } 01463 else if (curString == StandaloneString) 01464 { 01465 if (XMLString::equals(rawValue, XMLUni::fgYesString)) 01466 fStandalone = true; 01467 else if (XMLString::equals(rawValue, XMLUni::fgNoString)) 01468 fStandalone = false; 01469 else 01470 { 01471 emitError(XMLErrs::BadStandalone); 01472 //if (!XMLString::compareIString(rawValue, XMLUni::fgYesString)) 01473 //else if (!XMLString::compareIString(rawValue, XMLUni::fgNoString)) 01474 if (buffers[curString]->getLen() == 3 && 01475 (((rawValue[0] == chLatin_y) || (rawValue[0] == chLatin_Y)) && 01476 ((rawValue[1] == chLatin_e) || (rawValue[1] == chLatin_E)) && 01477 ((rawValue[2] == chLatin_s) || (rawValue[2] == chLatin_S)))) 01478 fStandalone = true; 01479 else if (buffers[curString]->getLen() == 2 && 01480 (((rawValue[0] == chLatin_n) || (rawValue[0] == chLatin_N)) && 01481 ((rawValue[1] == chLatin_o) || (rawValue[1] == chLatin_O)))) 01482 fStandalone = false; 01483 } 01484 } 01485 } 01486 01487 // Make sure that the strings present are in order. We don't care about 01488 // which ones are present at this point, just that any there are in the 01489 // right order. 01490 int curTop = 0; 01491 for (int index = VersionString; index < StandaloneString; index++) 01492 { 01493 if (flags[index] != -1) 01494 { 01495 if (flags[index] != curTop + 1) 01496 { 01497 emitError(XMLErrs::DeclStringsInWrongOrder); 01498 break; 01499 } 01500 curTop = flags[index]; 01501 } 01502 } 01503 01504 // If its an XML decl, the version must be present. 01505 // If its a Text decl, then encoding must be present AND standalone must not be present. 01506 if ((type == Decl_XML) && (flags[VersionString] == -1)) 01507 emitError(XMLErrs::XMLVersionRequired); 01508 else if (type == Decl_Text) { 01509 if (flags[StandaloneString] != -1) 01510 emitError(XMLErrs::StandaloneNotLegal); 01511 if (flags[EncodingString] == -1) 01512 emitError(XMLErrs::EncodingRequired); 01513 } 01514 01515 if (!fReaderMgr.skippedChar(chQuestion)) 01516 { 01517 emitError(XMLErrs::UnterminatedXMLDecl); 01518 fReaderMgr.skipPastChar(chCloseAngle); 01519 } 01520 else if (!fReaderMgr.skippedChar(chCloseAngle)) 01521 { 01522 emitError(XMLErrs::UnterminatedXMLDecl); 01523 fReaderMgr.skipPastChar(chCloseAngle); 01524 } 01525 01526 // Do this before we possibly update the reader with the 01527 // actual encoding string. Otherwise, we will pass the wrong thing 01528 // for the last parameter! 01529 const XMLCh* actualEnc = fReaderMgr.getCurrentEncodingStr(); 01530 01531 // Ok, we've now seen the real encoding string, if there was one, so 01532 // lets call back on the current reader and tell it what the real 01533 // encoding string was. If it fails, that's because it represents some 01534 // sort of contradiction with the autosensed format, and it keeps the 01535 // original encoding. 01536 // 01537 // NOTE: This can fail for a number of reasons, such as a bogus encoding 01538 // name or because its in flagrant contradiction of the auto-sensed 01539 // format. 01540 if (flags[EncodingString] != -1) 01541 { 01542 if (!fReaderMgr.getCurrentReader()->setEncoding(bbEncoding.getRawBuffer())) 01543 emitError(XMLErrs::ContradictoryEncoding, bbEncoding.getRawBuffer()); 01544 else 01545 actualEnc = bbEncoding.getRawBuffer(); 01546 } 01547 01548 // If we have a document handler then call the XML Decl callback. 01549 if (type == Decl_XML) 01550 { 01551 if (fDocHandler) 01552 fDocHandler->XMLDecl 01553 ( 01554 bbVersion.getRawBuffer() 01555 , bbEncoding.getRawBuffer() 01556 , bbStand.getRawBuffer() 01557 , actualEnc 01558 ); 01559 } 01560 else if (type == Decl_Text) 01561 { 01562 if (fDocTypeHandler) 01563 fDocTypeHandler->TextDecl 01564 ( 01565 bbVersion.getRawBuffer() 01566 , bbEncoding.getRawBuffer() 01567 ); 01568 } 01569 } 01570 01571 const XMLCh* XMLScanner::getURIText(const unsigned int uriId) const 01572 { 01573 if (fURIStringPool->exists(uriId)) { 01574 // Look up the URI in the string pool and return its id 01575 const XMLCh* value = fURIStringPool->getValueForId(uriId); 01576 if (!value) 01577 return XMLUni::fgZeroLenString; 01578 01579 return value; 01580 } 01581 else 01582 return XMLUni::fgZeroLenString; 01583 } 01584 01585 bool XMLScanner::getURIText( const unsigned int uriId 01586 , XMLBuffer& uriBufToFill) const 01587 { 01588 if (fURIStringPool->exists(uriId)) { 01589 // Look up the URI in the string pool and return its id 01590 const XMLCh* value = fURIStringPool->getValueForId(uriId); 01591 if (!value) 01592 return false; 01593 01594 uriBufToFill.set(value); 01595 return true; 01596 } 01597 else 01598 return false; 01599 } 01600 01601 bool XMLScanner::checkXMLDecl(bool startWithAngle) { 01602 01603 // [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>' 01604 // [24] VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"') 01605 // 01606 // [3] S ::= (#x20 | #x9 | #xD | #xA)+ 01607 if (startWithAngle) { 01608 if (fReaderMgr.peekString(XMLUni::fgXMLDeclString)) { 01609 if (fReaderMgr.skippedString(XMLUni::fgXMLDeclStringSpace) 01610 || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringHTab) 01611 || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringLF) 01612 || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringCR)) 01613 { 01614 return true; 01615 } 01616 } 01617 else if (fReaderMgr.skippedString(XMLUni::fgXMLDeclStringSpaceU) 01618 || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringHTabU) 01619 || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringLFU) 01620 || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringCRU)) 01621 { 01622 // Just in case, check for upper case. If found, issue 01623 // an error, but keep going. 01624 emitError(XMLErrs::XMLDeclMustBeLowerCase); 01625 return true; 01626 } 01627 } 01628 else { 01629 if (fReaderMgr.peekString(XMLUni::fgXMLString)) { 01630 if (fReaderMgr.skippedString(XMLUni::fgXMLStringSpace) 01631 || fReaderMgr.skippedString(XMLUni::fgXMLStringHTab) 01632 || fReaderMgr.skippedString(XMLUni::fgXMLStringLF) 01633 || fReaderMgr.skippedString(XMLUni::fgXMLStringCR)) 01634 { 01635 return true; 01636 } 01637 } 01638 else if (fReaderMgr.skippedString(XMLUni::fgXMLStringSpaceU) 01639 || fReaderMgr.skippedString(XMLUni::fgXMLStringHTabU) 01640 || fReaderMgr.skippedString(XMLUni::fgXMLStringLFU) 01641 || fReaderMgr.skippedString(XMLUni::fgXMLStringCRU)) 01642 { 01643 // Just in case, check for upper case. If found, issue 01644 // an error, but keep going. 01645 emitError(XMLErrs::XMLDeclMustBeLowerCase); 01646 return true; 01647 } 01648 } 01649 01650 return false; 01651 } 01652 01653 01654 // --------------------------------------------------------------------------- 01655 // XMLScanner: Grammar preparsing 01656 // --------------------------------------------------------------------------- 01657 Grammar* XMLScanner::loadGrammar(const XMLCh* const systemId 01658 , const short grammarType 01659 , const bool toCache) 01660 { 01661 InputSource* srcToUse = 0; 01662 01663 if (fEntityHandler){ 01664 ReaderMgr::LastExtEntityInfo lastInfo; 01665 fReaderMgr.getLastExtEntityInfo(lastInfo); 01666 XMLResourceIdentifier resourceIdentifier(XMLResourceIdentifier::ExternalEntity, 01667 systemId, 0, XMLUni::fgZeroLenString, lastInfo.systemId, 01668 &fReaderMgr); 01669 srcToUse = fEntityHandler->resolveEntity(&resourceIdentifier); 01670 } 01671 01672 // First we try to parse it as a URL. If that fails, we assume its 01673 // a file and try it that way. 01674 if (!srcToUse) { 01675 if (fDisableDefaultEntityResolution) 01676 return 0; 01677 01678 try 01679 { 01680 // Create a temporary URL. Since this is the primary document, 01681 // it has to be fully qualified. If not, then assume we are just 01682 // mistaking a file for a URL. 01683 XMLURL tmpURL(fMemoryManager); 01684 01685 if (XMLURL::parse(systemId, tmpURL)) { 01686 01687 if (tmpURL.isRelative()) 01688 { 01689 if (!fStandardUriConformant) 01690 srcToUse = new (fMemoryManager) LocalFileInputSource(systemId, fMemoryManager); 01691 else { 01692 // since this is the top of the try/catch, cannot call ThrowXMLwithMemMgr 01693 // emit the error directly 01694 MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_NoProtocolPresent, fMemoryManager); 01695 fInException = true; 01696 emitError 01697 ( 01698 XMLErrs::XMLException_Fatal 01699 , e.getCode() 01700 , e.getMessage() 01701 ); 01702 return 0; 01703 } 01704 } 01705 else 01706 { 01707 if (fStandardUriConformant && tmpURL.hasInvalidChar()) { 01708 MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_MalformedURL, fMemoryManager); 01709 fInException = true; 01710 emitError 01711 ( 01712 XMLErrs::XMLException_Fatal 01713 , e.getCode() 01714 , e.getMessage() 01715 ); 01716 return 0; 01717 } 01718 srcToUse = new (fMemoryManager) URLInputSource(tmpURL, fMemoryManager); 01719 } 01720 } 01721 else 01722 { 01723 if (!fStandardUriConformant) 01724 srcToUse = new (fMemoryManager) LocalFileInputSource(systemId, fMemoryManager); 01725 else { 01726 // since this is the top of the try/catch, cannot call ThrowXMLwithMemMgr 01727 // emit the error directly 01728 // lazy bypass ... since all MalformedURLException are fatal, no need to check the type 01729 MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_MalformedURL); 01730 fInException = true; 01731 emitError 01732 ( 01733 XMLErrs::XMLException_Fatal 01734 , e.getCode() 01735 , e.getMessage() 01736 ); 01737 return 0; 01738 } 01739 } 01740 } 01741 catch(const XMLException& excToCatch) 01742 { 01743 // For any other XMLException, 01744 // emit the error and catch any user exception thrown from here. 01745 fInException = true; 01746 if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning) 01747 emitError 01748 ( 01749 XMLErrs::XMLException_Warning 01750 , excToCatch.getCode() 01751 , excToCatch.getMessage() 01752 ); 01753 else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal) 01754 emitError 01755 ( 01756 XMLErrs::XMLException_Fatal 01757 , excToCatch.getCode() 01758 , excToCatch.getMessage() 01759 ); 01760 else 01761 emitError 01762 ( 01763 XMLErrs::XMLException_Error 01764 , excToCatch.getCode() 01765 , excToCatch.getMessage() 01766 ); 01767 return 0; 01768 } 01769 } 01770 01771 Janitor<InputSource> janSrc(srcToUse); 01772 return loadGrammar(*srcToUse, grammarType, toCache); 01773 } 01774 01775 Grammar* XMLScanner::loadGrammar(const char* const systemId 01776 , const short grammarType 01777 , const bool toCache) 01778 { 01779 // We just delegate this to the XMLCh version after transcoding 01780 XMLCh* tmpBuf = XMLString::transcode(systemId, fMemoryManager); 01781 ArrayJanitor<XMLCh> janBuf(tmpBuf, fMemoryManager); 01782 return loadGrammar(tmpBuf, grammarType, toCache); 01783 } 01784 01785 01786 // --------------------------------------------------------------------------- 01787 // XMLScanner: Setter methods 01788 // --------------------------------------------------------------------------- 01789 void XMLScanner::setURIStringPool(XMLStringPool* const stringPool) 01790 { 01791 fURIStringPool = stringPool; 01792 fEmptyNamespaceId = fURIStringPool->addOrFind(XMLUni::fgZeroLenString); 01793 fUnknownNamespaceId = fURIStringPool->addOrFind(XMLUni::fgUnknownURIName); 01794 fXMLNamespaceId = fURIStringPool->addOrFind(XMLUni::fgXMLURIName); 01795 fXMLNSNamespaceId = fURIStringPool->addOrFind(XMLUni::fgXMLNSURIName); 01796 } 01797 01798 // --------------------------------------------------------------------------- 01799 // XMLScanner: Private helper methods 01800 // --------------------------------------------------------------------------- 01801 01802 /*** 01803 * In reusing grammars (cacheing grammar from parse, or use cached grammar), internal 01804 * dtd is allowed conditionally. 01805 * 01806 * In the case of cacheing grammar from parse, it is NOT allowed. 01807 * 01808 * In the case of use cached grammar, 01809 * if external dtd is present and it is parsed before, then it is not allowed, 01810 * otherwise it is allowed. 01811 * 01812 ***/ 01813 void XMLScanner::checkInternalDTD(bool hasExtSubset 01814 ,const XMLCh* const sysId 01815 ,const XMLCh* const pubId) 01816 { 01817 if (fToCacheGrammar) 01818 ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Val_CantHaveIntSS, fMemoryManager); 01819 01820 if (fUseCachedGrammar && hasExtSubset && !fIgnoreCachedDTD) 01821 { 01822 InputSource* sysIdSrc = resolveSystemId(sysId, pubId); 01823 if (sysIdSrc) { 01824 Janitor<InputSource> janSysIdSrc(sysIdSrc); 01825 Grammar* grammar = fGrammarResolver->getGrammar(sysIdSrc->getSystemId()); 01826 01827 if (grammar && grammar->getGrammarType() == Grammar::DTDGrammarType) 01828 { 01829 ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Val_CantHaveIntSS, fMemoryManager); 01830 } 01831 } 01832 } 01833 01834 } 01835 01836 // This method is called after the content scan to insure that all the 01837 // ID/IDREF attributes match up (i.e. that all IDREFs refer to IDs.) This is 01838 // an XML 1.0 rule, so we can do here in the core. 01839 01840 void XMLScanner::checkIDRefs() 01841 { 01842 // Iterate the id ref list. If we find any entries here which are used 01843 // but not declared, then that's an error. 01844 RefHashTableOfEnumerator<XMLRefInfo> refEnum(fValidationContext->getIdRefList(), false, fMemoryManager); 01845 while (refEnum.hasMoreElements()) 01846 { 01847 // Get a ref to the current element 01848 const XMLRefInfo& curRef = refEnum.nextElement(); 01849 01850 // If its used but not declared, then its an error 01851 if (!curRef.getDeclared() && curRef.getUsed() && fValidate) 01852 fValidator->emitError(XMLValid::IDNotDeclared, curRef.getRefName()); 01853 } 01854 } 01855 01856 01857 // This just does a simple check that the passed progressive scan token is 01858 // legal for this scanner. 01859 bool XMLScanner::isLegalToken(const XMLPScanToken& toCheck) 01860 { 01861 return ((fScannerId == toCheck.fScannerId) 01862 && (fSequenceId == toCheck.fSequenceId)); 01863 } 01864 01865 01866 // This method will handle figuring out what the next top level token is 01867 // in the input stream. It will return an enumerated value that indicates 01868 // what it believes the next XML level token must be. It will eat as many 01869 // chars are required to figure out what is next. 01870 XMLScanner::XMLTokens XMLScanner::senseNextToken(XMLSize_t& orgReader) 01871 { 01872 // Get the next character and use it to guesstimate what the next token 01873 // is going to be. We turn on end of entity exceptions when we do this 01874 // in order to catch the scenario where the current entity ended at 01875 // the > of some markup. 01876 XMLCh nextCh=0; 01877 01878 XMLReader* curReader=fReaderMgr.getCurrentReader(); 01879 // avoid setting up the ThrowEOEJanitor if we know that we have data in the current reader 01880 if(curReader && curReader->charsLeftInBuffer()>0) 01881 nextCh = fReaderMgr.peekNextChar(); 01882 else 01883 { 01884 ThrowEOEJanitor janMgr(&fReaderMgr, true); 01885 nextCh = fReaderMgr.peekNextChar(); 01886 } 01887 01888 // If it's not a '<' we must be in content (unless it's a EOF) 01889 // 01890 // This includes entity references '&' of some sort. These must 01891 // be character data because that's the only place a reference can 01892 // occur in content. 01893 if (nextCh != chOpenAngle) 01894 return nextCh?Token_CharData:Token_EOF; 01895 01896 // Ok it had to have been a '<' character. So get it out of the reader 01897 // and store the reader number where we saw it, passing it back to the 01898 // caller. 01899 fReaderMgr.getNextChar(); 01900 orgReader = fReaderMgr.getCurrentReaderNum(); 01901 01902 // Ok, so lets go through the things that it could be at this point which 01903 // are all some form of markup. 01904 switch(fReaderMgr.peekNextChar()) 01905 { 01906 case chForwardSlash: 01907 { 01908 fReaderMgr.getNextChar(); 01909 return Token_EndTag; 01910 } 01911 case chBang: 01912 { 01913 static const XMLCh gCDATAStr[] = 01914 { 01915 chBang, chOpenSquare, chLatin_C, chLatin_D, chLatin_A 01916 , chLatin_T, chLatin_A, chNull 01917 }; 01918 01919 static const XMLCh gCommentString[] = 01920 { 01921 chBang, chDash, chDash, chNull 01922 }; 01923 01924 if (fReaderMgr.skippedString(gCDATAStr)) 01925 return Token_CData; 01926 01927 if (fReaderMgr.skippedString(gCommentString)) 01928 return Token_Comment; 01929 01930 emitError(XMLErrs::ExpectedCommentOrCDATA); 01931 return Token_Unknown; 01932 } 01933 case chQuestion: 01934 { 01935 // It must be a PI 01936 fReaderMgr.getNextChar(); 01937 return Token_PI; 01938 } 01939 } 01940 // Assume its an element name, so return with a start tag token. If it 01941 // turns out not to be, then it will fail when it cannot get a valid tag. 01942 return Token_StartTag; 01943 } 01944 01945 // --------------------------------------------------------------------------- 01946 // XMLScanner: Private parsing methods 01947 // --------------------------------------------------------------------------- 01948 01949 // This guy just scans out a single or double quoted string of characters. 01950 // It does not pass any judgement on the contents and assumes that it is 01951 // illegal to have another quote of the same kind inside the string's 01952 // contents. 01953 // 01954 // NOTE: This is for simple stuff like the strings in the XMLDecl which 01955 // cannot have any entities inside them. So this guy does not handle any 01956 // end of entity stuff. 01957 bool XMLScanner::getQuotedString(XMLBuffer& toFill) 01958 { 01959 // Reset the target buffer 01960 toFill.reset(); 01961 01962 // Get the next char which must be a single or double quote 01963 XMLCh quoteCh; 01964 if (!fReaderMgr.skipIfQuote(quoteCh)) 01965 return false; 01966 01967 XMLCh nextCh; 01968 // Get another char and see if it matches the starting quote char 01969 while ((nextCh=fReaderMgr.getNextChar())!=quoteCh) 01970 { 01971 // We should never get either an end of file null char here. If we 01972 // do, just fail. It will be handled more gracefully in the higher 01973 // level code that called us. 01974 if (!nextCh) 01975 return false; 01976 01977 // Else add it to the buffer 01978 toFill.append(nextCh); 01979 } 01980 return true; 01981 } 01982 01983 01984 // This method scans a character reference and returns the character that 01985 // was refered to. It assumes that we've already scanned the &# characters 01986 // that prefix the numeric code. 01987 bool XMLScanner::scanCharRef(XMLCh& toFill, XMLCh& second) 01988 { 01989 bool gotOne = false; 01990 unsigned int value = 0; 01991 01992 // Set the radix. Its supposed to be a lower case x if hex. But, in 01993 // order to recover well, we check for an upper and put out an error 01994 // for that. 01995 unsigned int radix = 10; 01996 if (fReaderMgr.skippedChar(chLatin_x)) 01997 { 01998 radix = 16; 01999 } 02000 else if (fReaderMgr.skippedChar(chLatin_X)) 02001 { 02002 emitError(XMLErrs::HexRadixMustBeLowerCase); 02003 radix = 16; 02004 } 02005 02006 while (true) 02007 { 02008 const XMLCh nextCh = fReaderMgr.peekNextChar(); 02009 02010 // Watch for EOF 02011 if (!nextCh) 02012 ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager); 02013 02014 // Break out on the terminating semicolon 02015 if (nextCh == chSemiColon) 02016 { 02017 fReaderMgr.getNextChar(); 02018 break; 02019 } 02020 02021 // Convert this char to a binary value, or bail out if its not 02022 // one. 02023 unsigned int nextVal; 02024 if ((nextCh >= chDigit_0) && (nextCh <= chDigit_9)) 02025 nextVal = (unsigned int)(nextCh - chDigit_0); 02026 else if ((nextCh >= chLatin_A) && (nextCh <= chLatin_F)) 02027 nextVal= (unsigned int)(10 + (nextCh - chLatin_A)); 02028 else if ((nextCh >= chLatin_a) && (nextCh <= chLatin_f)) 02029 nextVal = (unsigned int)(10 + (nextCh - chLatin_a)); 02030 else 02031 { 02032 // Return a zero 02033 toFill = 0; 02034 02035 // If we got at least a sigit, then do an unterminated ref error. 02036 // Else, do an expected a numerical ref thing. 02037 if (gotOne) 02038 emitError(XMLErrs::UnterminatedCharRef); 02039 else 02040 emitError(XMLErrs::ExpectedNumericalCharRef); 02041 02042 // Return failure 02043 return false; 02044 } 02045 02046 // Make sure its valid for the radix. If not, then just eat the 02047 // digit and go on after issueing an error. Else, update the 02048 // running value with this new digit. 02049 if (nextVal >= radix) 02050 { 02051 XMLCh tmpStr[2]; 02052 tmpStr[0] = nextCh; 02053 tmpStr[1] = chNull; 02054 emitError(XMLErrs::BadDigitForRadix, tmpStr); 02055 } 02056 else 02057 { 02058 value = (value * radix) + nextVal; 02059 // Guard against overflow. 02060 if (value > 0x10FFFF) { 02061 // Character reference was not in the valid range 02062 emitError(XMLErrs::InvalidCharacterRef); 02063 return false; 02064 } 02065 } 02066 02067 // Indicate that we got at least one good digit 02068 gotOne = true; 02069 02070 // And eat the last char 02071 fReaderMgr.getNextChar(); 02072 } 02073 02074 // Return the char (or chars) 02075 // And check if the character expanded is valid or not 02076 if (value >= 0x10000 && value <= 0x10FFFF) 02077 { 02078 value -= 0x10000; 02079 toFill = XMLCh((value >> 10) + 0xD800); 02080 second = XMLCh((value & 0x3FF) + 0xDC00); 02081 } 02082 else if (value <= 0xFFFD) 02083 { 02084 toFill = XMLCh(value); 02085 second = 0; 02086 if (!fReaderMgr.getCurrentReader()->isXMLChar(toFill) && !fReaderMgr.getCurrentReader()->isControlChar(toFill)) { 02087 // Character reference was not in the valid range 02088 emitError(XMLErrs::InvalidCharacterRef); 02089 return false; 02090 } 02091 } 02092 else { 02093 // Character reference was not in the valid range 02094 emitError(XMLErrs::InvalidCharacterRef); 02095 return false; 02096 } 02097 02098 return true; 02099 } 02100 02101 02102 // We get here after the '<!--' part of the comment. We scan past the 02103 // terminating '-->' It will calls the appropriate handler with the comment 02104 // text, if one is provided. A comment can be in either the document or 02105 // the DTD, so the fInDocument flag is used to know which handler to send 02106 // it to. 02107 void XMLScanner::scanComment() 02108 { 02109 02110 enum States 02111 { 02112 InText 02113 , OneDash 02114 , TwoDashes 02115 }; 02116 02117 // Get a buffer for this 02118 XMLBufBid bbComment(&fBufMgr); 02119 02120 // Get the comment text into a temp buffer. Be sure to use temp buffer 02121 // two here, since its to be used for stuff that is potentially longer 02122 // than just a name. 02123 States curState = InText; 02124 bool gotLeadingSurrogate = false; 02125 while (true) 02126 { 02127 // Get the next character 02128 const XMLCh nextCh = fReaderMgr.getNextChar(); 02129 02130 // Watch for an end of file 02131 if (!nextCh) 02132 { 02133 emitError(XMLErrs::UnterminatedComment); 02134 ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager); 02135 } 02136 02137 // Check for correct surrogate pairs 02138 if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF)) 02139 { 02140 if (gotLeadingSurrogate) 02141 emitError(XMLErrs::Expected2ndSurrogateChar); 02142 else 02143 gotLeadingSurrogate = true; 02144 } 02145 else 02146 { 02147 if (gotLeadingSurrogate) 02148 { 02149 if ((nextCh < 0xDC00) || (nextCh > 0xDFFF)) 02150 emitError(XMLErrs::Expected2ndSurrogateChar); 02151 } 02152 // Its got to at least be a valid XML character 02153 else if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh)) { 02154 02155 XMLCh tmpBuf[9]; 02156 XMLString::binToText 02157 ( 02158 nextCh 02159 , tmpBuf 02160 , 8 02161 , 16 02162 , fMemoryManager 02163 ); 02164 emitError(XMLErrs::InvalidCharacter, tmpBuf); 02165 } 02166 02167 gotLeadingSurrogate = false; 02168 } 02169 02170 if (curState == InText) 02171 { 02172 // If its a dash, go to OneDash state. Otherwise take as text 02173 if (nextCh == chDash) 02174 curState = OneDash; 02175 else 02176 bbComment.append(nextCh); 02177 } 02178 else if (curState == OneDash) 02179 { 02180 // If its another dash, then we change to the two dashes states. 02181 // Otherwise, we have to put in the deficit dash and the new 02182 // character and go back to InText. 02183 if (nextCh == chDash) 02184 { 02185 curState = TwoDashes; 02186 } 02187 else 02188 { 02189 bbComment.append(chDash); 02190 bbComment.append(nextCh); 02191 curState = InText; 02192 } 02193 } 02194 else if (curState == TwoDashes) 02195 { 02196 // The next character must be the closing bracket 02197 if (nextCh != chCloseAngle) 02198 { 02199 emitError(XMLErrs::IllegalSequenceInComment); 02200 fReaderMgr.skipPastChar(chCloseAngle); 02201 return; 02202 } 02203 break; 02204 } 02205 } 02206 02207 // If we have an available handler, call back with the comment. 02208 if (fDocHandler) 02209 { 02210 fDocHandler->docComment 02211 ( 02212 bbComment.getRawBuffer() 02213 ); 02214 } 02215 02216 //mark comment is seen within the current element 02217 if (! fElemStack.isEmpty()) 02218 fElemStack.setCommentOrPISeen(); 02219 02220 } 02221 02222 02223 // Most equal signs can have white space around them, so this little guy 02224 // just makes the calling code cleaner by eating whitespace. 02225 bool XMLScanner::scanEq(bool inDecl) 02226 { 02227 if(inDecl) 02228 { 02229 bool skippedSomething; 02230 fReaderMgr.skipPastSpaces(skippedSomething, inDecl); 02231 if (fReaderMgr.skippedChar(chEqual)) 02232 { 02233 fReaderMgr.skipPastSpaces(skippedSomething, inDecl); 02234 return true; 02235 } 02236 } 02237 else 02238 { 02239 fReaderMgr.skipPastSpaces(); 02240 if (fReaderMgr.skippedChar(chEqual)) 02241 { 02242 fReaderMgr.skipPastSpaces(); 02243 return true; 02244 } 02245 } 02246 return false; 02247 } 02248 02249 02250 XMLSize_t 02251 XMLScanner::scanUpToWSOr(XMLBuffer& toFill, const XMLCh chEndChar) 02252 { 02253 fReaderMgr.getUpToCharOrWS(toFill, chEndChar); 02254 return toFill.getLen(); 02255 } 02256 02257 unsigned int *XMLScanner::getNewUIntPtr() 02258 { 02259 // this method hands back a new pointer initialized to 0 02260 unsigned int *retVal; 02261 if(fUIntPoolCol < 64) 02262 { 02263 retVal = fUIntPool[fUIntPoolRow]+fUIntPoolCol; 02264 fUIntPoolCol++; 02265 return retVal; 02266 } 02267 // time to grow the pool... 02268 if(fUIntPoolRow+1 == fUIntPoolRowTotal) 02269 { 02270 // and time to add some space for new rows: 02271 fUIntPoolRowTotal <<= 1; 02272 unsigned int **newArray = (unsigned int **)fMemoryManager->allocate(sizeof(unsigned int *) * fUIntPoolRowTotal ); 02273 memcpy(newArray, fUIntPool, (fUIntPoolRow+1) * sizeof(unsigned int *)); 02274 fMemoryManager->deallocate(fUIntPool); 02275 fUIntPool = newArray; 02276 // need to 0 out new elements we won't need: 02277 for (unsigned int i=fUIntPoolRow+2; i<fUIntPoolRowTotal; i++) 02278 fUIntPool[i] = 0; 02279 } 02280 // now to add a new row; we just made sure we have space 02281 fUIntPoolRow++; 02282 fUIntPool[fUIntPoolRow] = (unsigned int *)fMemoryManager->allocate(sizeof(unsigned int) << 6); 02283 memset(fUIntPool[fUIntPoolRow], 0, sizeof(unsigned int) << 6); 02284 // point to next element 02285 fUIntPoolCol = 1; 02286 return fUIntPool[fUIntPoolRow]; 02287 } 02288 02289 void XMLScanner::resetUIntPool() 02290 { 02291 // to reuse the unsigned int pool--and the hashtables that use it-- 02292 // simply reinitialize everything to 0's 02293 for(unsigned int i = 0; i<= fUIntPoolRow; i++) 02294 memset(fUIntPool[i], 0, sizeof(unsigned int) << 6); 02295 } 02296 02297 void XMLScanner::recreateUIntPool() 02298 { 02299 // this allows a bloated unsigned int pool to be dispensed with 02300 02301 // first, delete old fUIntPool 02302 for (unsigned int i=0; i<=fUIntPoolRow; i++) 02303 { 02304 fMemoryManager->deallocate(fUIntPool[i]); 02305 } 02306 fMemoryManager->deallocate(fUIntPool); 02307 02308 fUIntPoolRow = fUIntPoolCol = 0; 02309 fUIntPoolRowTotal = 2; 02310 fUIntPool = (unsigned int **)fMemoryManager->allocate(sizeof(unsigned int *) * fUIntPoolRowTotal); 02311 fUIntPool[0] = (unsigned int *)fMemoryManager->allocate(sizeof(unsigned int) << 6); 02312 memset(fUIntPool[fUIntPoolRow], 0, sizeof(unsigned int) << 6); 02313 fUIntPool[1] = 0; 02314 } 02315 02316 unsigned int XMLScanner::resolvePrefix( const XMLCh* const prefix 02317 , const ElemStack::MapModes mode) 02318 { 02319 // 02320 // If the prefix is empty, and we are in attribute mode, then we assign 02321 // it to the empty namespace because the default namespace does not 02322 // apply to attributes. 02323 // 02324 if (!*prefix) 02325 { 02326 if(mode == ElemStack::Mode_Attribute) 02327 return fEmptyNamespaceId; 02328 } 02329 // Watch for the special namespace prefixes. We always map these to 02330 // special URIs. 'xml' gets mapped to the official URI that its defined 02331 // to map to by the NS spec. xmlns gets mapped to a special place holder 02332 // URI that we define (so that it maps to something checkable.) 02333 else 02334 { 02335 if (XMLString::equals(prefix, XMLUni::fgXMLNSString)) 02336 return fXMLNSNamespaceId; 02337 else if (XMLString::equals(prefix, XMLUni::fgXMLString)) 02338 return fXMLNamespaceId; 02339 } 02340 02341 // Ask the element stack to search up itself for a mapping for the 02342 // passed prefix. 02343 bool unknown; 02344 unsigned int uriId = fElemStack.mapPrefixToURI(prefix, unknown); 02345 02346 // If it was unknown, then the URI was faked in but we have to issue an error 02347 if (unknown) 02348 emitError(XMLErrs::UnknownPrefix, prefix); 02349 02350 // check to see if uriId is empty; in XML 1.1 an emptynamespace is okay unless 02351 // we are trying to use it. 02352 if (*prefix && 02353 mode == ElemStack::Mode_Element && 02354 fXMLVersion != XMLReader::XMLV1_0 && 02355 uriId == fElemStack.getEmptyNamespaceId()) 02356 emitError(XMLErrs::UnknownPrefix, prefix); 02357 02358 return uriId; 02359 } 02360 02361 unsigned int 02362 XMLScanner::resolveQName( const XMLCh* const qName 02363 , XMLBuffer& prefixBuf 02364 , const ElemStack::MapModes mode 02365 , int& prefixColonPos) 02366 { 02367 prefixColonPos = XMLString::indexOf(qName, chColon); 02368 return resolveQNameWithColon(qName, prefixBuf, mode, prefixColonPos); 02369 } 02370 02371 unsigned int 02372 XMLScanner::resolveQNameWithColon( const XMLCh* const qName 02373 , XMLBuffer& prefixBuf 02374 , const ElemStack::MapModes mode 02375 , const int prefixColonPos) 02376 { 02377 // Lets split out the qName into a URI and name buffer first. The URI 02378 // can be empty. 02379 if (prefixColonPos == -1) 02380 { 02381 // Its all name with no prefix, so put the whole thing into the name 02382 // buffer. Then map the empty string to a URI, since the empty string 02383 // represents the default namespace. This will either return some 02384 // explicit URI which the default namespace is mapped to, or the 02385 // the default global namespace. 02386 prefixBuf.reset(); 02387 return resolvePrefix(XMLUni::fgZeroLenString, mode); 02388 } 02389 else 02390 { 02391 // Copy the chars up to but not including the colon into the prefix 02392 // buffer. 02393 prefixBuf.set(qName, prefixColonPos); 02394 return resolvePrefix(prefixBuf.getRawBuffer(), mode); 02395 } 02396 } 02397 02398 XERCES_CPP_NAMESPACE_END