GME
13
|
00001 /* 00002 * Licensed to the Apache Software Foundation (ASF) under one or more 00003 * contributor license agreements. See the NOTICE file distributed with 00004 * this work for additional information regarding copyright ownership. 00005 * The ASF licenses this file to You under the Apache License, Version 2.0 00006 * (the "License"); you may not use this file except in compliance with 00007 * the License. You may obtain a copy of the License at 00008 * 00009 * http://www.apache.org/licenses/LICENSE-2.0 00010 * 00011 * Unless required by applicable law or agreed to in writing, software 00012 * distributed under the License is distributed on an "AS IS" BASIS, 00013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00014 * See the License for the specific language governing permissions and 00015 * limitations under the License. 00016 */ 00017 00018 /* 00019 * $Id: DGXMLScanner.cpp 833045 2009-11-05 13:21:27Z borisk $ 00020 */ 00021 00022 00023 // --------------------------------------------------------------------------- 00024 // Includes 00025 // --------------------------------------------------------------------------- 00026 #include <xercesc/internal/DGXMLScanner.hpp> 00027 #include <xercesc/util/Janitor.hpp> 00028 #include <xercesc/util/RuntimeException.hpp> 00029 #include <xercesc/util/UnexpectedEOFException.hpp> 00030 #include <xercesc/util/XMLUri.hpp> 00031 #include <xercesc/framework/URLInputSource.hpp> 00032 #include <xercesc/framework/LocalFileInputSource.hpp> 00033 #include <xercesc/framework/XMLDocumentHandler.hpp> 00034 #include <xercesc/framework/XMLEntityHandler.hpp> 00035 #include <xercesc/framework/XMLPScanToken.hpp> 00036 #include <xercesc/framework/XMLGrammarPool.hpp> 00037 #include <xercesc/framework/XMLDTDDescription.hpp> 00038 #include <xercesc/internal/EndOfEntityException.hpp> 00039 #include <xercesc/validators/common/GrammarResolver.hpp> 00040 #include <xercesc/validators/DTD/DocTypeHandler.hpp> 00041 #include 
<xercesc/validators/DTD/DTDScanner.hpp> 00042 #include <xercesc/validators/DTD/DTDValidator.hpp> 00043 #include <xercesc/util/OutOfMemoryException.hpp> 00044 #include <xercesc/util/XMLResourceIdentifier.hpp> 00045 00046 XERCES_CPP_NAMESPACE_BEGIN 00047 00048 00049 typedef JanitorMemFunCall<DGXMLScanner> CleanupType; 00050 typedef JanitorMemFunCall<ReaderMgr> ReaderMgrResetType; 00051 00052 00053 // --------------------------------------------------------------------------- 00054 // DGXMLScanner: Constructors and Destructor 00055 // --------------------------------------------------------------------------- 00056 DGXMLScanner::DGXMLScanner(XMLValidator* const valToAdopt 00057 , GrammarResolver* const grammarResolver 00058 , MemoryManager* const manager) : 00059 00060 XMLScanner(valToAdopt, grammarResolver, manager) 00061 , fAttrNSList(0) 00062 , fDTDValidator(0) 00063 , fDTDGrammar(0) 00064 , fDTDElemNonDeclPool(0) 00065 , fElemCount(0) 00066 , fAttDefRegistry(0) 00067 , fUndeclaredAttrRegistry(0) 00068 { 00069 CleanupType cleanup(this, &DGXMLScanner::cleanUp); 00070 00071 try 00072 { 00073 commonInit(); 00074 } 00075 catch(const OutOfMemoryException&) 00076 { 00077 // Don't cleanup when out of memory, since executing the 00078 // code can cause problems. 
00079 cleanup.release(); 00080 00081 throw; 00082 } 00083 00084 cleanup.release(); 00085 } 00086 00087 DGXMLScanner::DGXMLScanner( XMLDocumentHandler* const docHandler 00088 , DocTypeHandler* const docTypeHandler 00089 , XMLEntityHandler* const entityHandler 00090 , XMLErrorReporter* const errHandler 00091 , XMLValidator* const valToAdopt 00092 , GrammarResolver* const grammarResolver 00093 , MemoryManager* const manager) : 00094 00095 XMLScanner(docHandler, docTypeHandler, entityHandler, errHandler, valToAdopt, grammarResolver, manager) 00096 , fAttrNSList(0) 00097 , fDTDValidator(0) 00098 , fDTDGrammar(0) 00099 , fDTDElemNonDeclPool(0) 00100 , fElemCount(0) 00101 , fAttDefRegistry(0) 00102 , fUndeclaredAttrRegistry(0) 00103 { 00104 CleanupType cleanup(this, &DGXMLScanner::cleanUp); 00105 00106 try 00107 { 00108 commonInit(); 00109 } 00110 catch(const OutOfMemoryException&) 00111 { 00112 // Don't cleanup when out of memory, since executing the 00113 // code can cause problems. 00114 cleanup.release(); 00115 00116 throw; 00117 } 00118 00119 cleanup.release(); 00120 } 00121 00122 DGXMLScanner::~DGXMLScanner() 00123 { 00124 cleanUp(); 00125 } 00126 00127 // --------------------------------------------------------------------------- 00128 // XMLScanner: Getter methods 00129 // --------------------------------------------------------------------------- 00130 NameIdPool<DTDEntityDecl>* DGXMLScanner::getEntityDeclPool() 00131 { 00132 if(!fGrammar) 00133 return 0; 00134 return ((DTDGrammar*)fGrammar)->getEntityDeclPool(); 00135 } 00136 00137 const NameIdPool<DTDEntityDecl>* DGXMLScanner::getEntityDeclPool() const 00138 { 00139 if(!fGrammar) 00140 return 0; 00141 return ((DTDGrammar*)fGrammar)->getEntityDeclPool(); 00142 } 00143 00144 // --------------------------------------------------------------------------- 00145 // DGXMLScanner: Main entry point to scan a document 00146 // --------------------------------------------------------------------------- 00147 void 
DGXMLScanner::scanDocument(const InputSource& src) 00148 { 00149 // Bump up the sequence id for this parser instance. This will invalidate 00150 // any previous progressive scan tokens. 00151 fSequenceId++; 00152 00153 ReaderMgrResetType resetReaderMgr(&fReaderMgr, &ReaderMgr::reset); 00154 00155 try 00156 { 00157 // Reset the scanner and its plugged in stuff for a new run. This 00158 // resets all the data structures, creates the initial reader and 00159 // pushes it on the stack, and sets up the base document path. 00160 scanReset(src); 00161 00162 // If we have a document handler, then call the start document 00163 if (fDocHandler) 00164 fDocHandler->startDocument(); 00165 00166 // Scan the prolog part, which is everything before the root element 00167 // including the DTD subsets. 00168 scanProlog(); 00169 00170 // If we got to the end of input, then its not a valid XML file. 00171 // Else, go on to scan the content. 00172 if (fReaderMgr.atEOF()) 00173 { 00174 emitError(XMLErrs::EmptyMainEntity); 00175 } 00176 else 00177 { 00178 // Scan content, and tell it its not an external entity 00179 if (scanContent()) 00180 { 00181 // Do post-parse validation if required 00182 if (fValidate) 00183 { 00184 // We handle ID reference semantics at this level since 00185 // its required by XML 1.0. 00186 checkIDRefs(); 00187 00188 // Then allow the validator to do any extra stuff it wants 00189 // fValidator->postParseValidation(); 00190 } 00191 00192 // That went ok, so scan for any miscellaneous stuff 00193 if (!fReaderMgr.atEOF()) 00194 scanMiscellaneous(); 00195 } 00196 } 00197 00198 // If we have a document handler, then call the end document 00199 if (fDocHandler) 00200 fDocHandler->endDocument(); 00201 } 00202 // NOTE: 00203 // 00204 // In all of the error processing below, the emitError() call MUST come 00205 // before the flush of the reader mgr, or it will fail because it tries 00206 // to find out the position in the XML source of the error. 
00207 catch(const XMLErrs::Codes) 00208 { 00209 // This is a 'first failure' exception, so fall through 00210 } 00211 catch(const XMLValid::Codes) 00212 { 00213 // This is a 'first fatal error' type exit, so fall through 00214 } 00215 catch(const XMLException& excToCatch) 00216 { 00217 // Emit the error and catch any user exception thrown from here. Make 00218 // sure in all cases we flush the reader manager. 00219 fInException = true; 00220 try 00221 { 00222 if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning) 00223 emitError 00224 ( 00225 XMLErrs::XMLException_Warning 00226 , excToCatch.getCode() 00227 , excToCatch.getMessage() 00228 ); 00229 else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal) 00230 emitError 00231 ( 00232 XMLErrs::XMLException_Fatal 00233 , excToCatch.getCode() 00234 , excToCatch.getMessage() 00235 ); 00236 else 00237 emitError 00238 ( 00239 XMLErrs::XMLException_Error 00240 , excToCatch.getCode() 00241 , excToCatch.getMessage() 00242 ); 00243 } 00244 catch(const OutOfMemoryException&) 00245 { 00246 // This is a special case for out-of-memory 00247 // conditions, because resetting the ReaderMgr 00248 // can be problematic. 00249 resetReaderMgr.release(); 00250 00251 throw; 00252 } 00253 } 00254 catch(const OutOfMemoryException&) 00255 { 00256 // This is a special case for out-of-memory 00257 // conditions, because resetting the ReaderMgr 00258 // can be problematic. 
00259 resetReaderMgr.release(); 00260 00261 throw; 00262 } 00263 } 00264 00265 00266 bool DGXMLScanner::scanNext(XMLPScanToken& token) 00267 { 00268 // Make sure this token is still legal 00269 if (!isLegalToken(token)) 00270 ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Scan_BadPScanToken, fMemoryManager); 00271 00272 // Find the next token and remember the reader id 00273 XMLSize_t orgReader; 00274 XMLTokens curToken; 00275 00276 ReaderMgrResetType resetReaderMgr(&fReaderMgr, &ReaderMgr::reset); 00277 00278 bool retVal = true; 00279 00280 try 00281 { 00282 while (true) 00283 { 00284 // We have to handle any end of entity exceptions that happen here. 00285 // We could be at the end of X nested entities, each of which will 00286 // generate an end of entity exception as we try to move forward. 00287 try 00288 { 00289 curToken = senseNextToken(orgReader); 00290 break; 00291 } 00292 catch(const EndOfEntityException& toCatch) 00293 { 00294 // Send an end of entity reference event 00295 if (fDocHandler) 00296 fDocHandler->endEntityReference(toCatch.getEntity()); 00297 } 00298 } 00299 00300 if (curToken == Token_CharData) 00301 { 00302 scanCharData(fCDataBuf); 00303 } 00304 else if (curToken == Token_EOF) 00305 { 00306 if (!fElemStack.isEmpty()) 00307 { 00308 const ElemStack::StackElem* topElem = fElemStack.popTop(); 00309 emitError 00310 ( 00311 XMLErrs::EndedWithTagsOnStack 00312 , topElem->fThisElement->getFullName() 00313 ); 00314 } 00315 00316 retVal = false; 00317 } 00318 else 00319 { 00320 // Its some sort of markup 00321 bool gotData = true; 00322 switch(curToken) 00323 { 00324 case Token_CData : 00325 // Make sure we are within content 00326 if (fElemStack.isEmpty()) 00327 emitError(XMLErrs::CDATAOutsideOfContent); 00328 scanCDSection(); 00329 break; 00330 00331 case Token_Comment : 00332 scanComment(); 00333 break; 00334 00335 case Token_EndTag : 00336 scanEndTag(gotData); 00337 break; 00338 00339 case Token_PI : 00340 scanPI(); 00341 break; 00342 00343 
case Token_StartTag : 00344 if (fDoNamespaces) 00345 scanStartTagNS(gotData); 00346 else 00347 scanStartTag(gotData); 00348 break; 00349 00350 default : 00351 fReaderMgr.skipToChar(chOpenAngle); 00352 break; 00353 } 00354 00355 if (orgReader != fReaderMgr.getCurrentReaderNum()) 00356 emitError(XMLErrs::PartialMarkupInEntity); 00357 00358 // If we hit the end, then do the miscellaneous part 00359 if (!gotData) 00360 { 00361 // Do post-parse validation if required 00362 if (fValidate) 00363 { 00364 // We handle ID reference semantics at this level since 00365 // its required by XML 1.0. 00366 checkIDRefs(); 00367 00368 // Then allow the validator to do any extra stuff it wants 00369 // fValidator->postParseValidation(); 00370 } 00371 00372 // That went ok, so scan for any miscellaneous stuff 00373 scanMiscellaneous(); 00374 00375 if (fDocHandler) 00376 fDocHandler->endDocument(); 00377 } 00378 } 00379 } 00380 // NOTE: 00381 // 00382 // In all of the error processing below, the emitError() call MUST come 00383 // before the flush of the reader mgr, or it will fail because it tries 00384 // to find out the position in the XML source of the error. 00385 catch(const XMLErrs::Codes) 00386 { 00387 // This is a 'first failure' exception, so return failure 00388 retVal = false; 00389 } 00390 catch(const XMLValid::Codes) 00391 { 00392 // This is a 'first fatal error' type exit, so return failure 00393 retVal = false; 00394 } 00395 catch(const XMLException& excToCatch) 00396 { 00397 // Emit the error and catch any user exception thrown from here. Make 00398 // sure in all cases we flush the reader manager. 
00399 fInException = true; 00400 try 00401 { 00402 if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning) 00403 emitError 00404 ( 00405 XMLErrs::XMLException_Warning 00406 , excToCatch.getCode() 00407 , excToCatch.getMessage() 00408 ); 00409 else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal) 00410 emitError 00411 ( 00412 XMLErrs::XMLException_Fatal 00413 , excToCatch.getCode() 00414 , excToCatch.getMessage() 00415 ); 00416 else 00417 emitError 00418 ( 00419 XMLErrs::XMLException_Error 00420 , excToCatch.getCode() 00421 , excToCatch.getMessage() 00422 ); 00423 } 00424 catch(const OutOfMemoryException&) 00425 { 00426 // This is a special case for out-of-memory 00427 // conditions, because resetting the ReaderMgr 00428 // can be problematic. 00429 resetReaderMgr.release(); 00430 00431 throw; 00432 } 00433 00434 retVal = false; 00435 } 00436 catch(const OutOfMemoryException&) 00437 { 00438 // This is a special case for out-of-memory 00439 // conditions, because resetting the ReaderMgr 00440 // can be problematic. 00441 resetReaderMgr.release(); 00442 00443 throw; 00444 } 00445 00446 // If we are not at the end, release the object that will 00447 // reset the ReaderMgr. 00448 if (retVal) 00449 resetReaderMgr.release(); 00450 00451 return retVal; 00452 } 00453 00454 00455 // --------------------------------------------------------------------------- 00456 // DGXMLScanner: Private scanning methods 00457 // --------------------------------------------------------------------------- 00458 00459 // This method will kick off the scanning of the primary content of the 00460 // document, i.e. the elements. 00461 bool DGXMLScanner::scanContent() 00462 { 00463 // Go into a loop until we hit the end of the root element, or we fall 00464 // out because there is no root element. 00465 // 00466 // We have to do kind of a deeply nested double loop here in order to 00467 // avoid doing the setup/teardown of the exception handler on each 00468 // round. 
Doing it this way we only do it when an exception actually 00469 // occurs. 00470 bool gotData = true; 00471 bool inMarkup = false; 00472 while (gotData) 00473 { 00474 try 00475 { 00476 while (gotData) 00477 { 00478 // Sense what the next top level token is. According to what 00479 // this tells us, we will call something to handle that kind 00480 // of thing. 00481 XMLSize_t orgReader; 00482 const XMLTokens curToken = senseNextToken(orgReader); 00483 00484 // Handle character data and end of file specially. Char data 00485 // is not markup so we don't want to handle it in the loop 00486 // below. 00487 if (curToken == Token_CharData) 00488 { 00489 // Scan the character data and call appropriate events. Let 00490 // him use our local character data buffer for efficiency. 00491 scanCharData(fCDataBuf); 00492 continue; 00493 } 00494 else if (curToken == Token_EOF) 00495 { 00496 // The element stack better be empty at this point or we 00497 // ended prematurely before all elements were closed. 00498 if (!fElemStack.isEmpty()) 00499 { 00500 const ElemStack::StackElem* topElem = fElemStack.popTop(); 00501 emitError 00502 ( 00503 XMLErrs::EndedWithTagsOnStack 00504 , topElem->fThisElement->getFullName() 00505 ); 00506 } 00507 00508 // Its the end of file, so clear the got data flag 00509 gotData = false; 00510 continue; 00511 } 00512 00513 // We are in some sort of markup now 00514 inMarkup = true; 00515 00516 // According to the token we got, call the appropriate 00517 // scanning method. 
00518 switch(curToken) 00519 { 00520 case Token_CData : 00521 // Make sure we are within content 00522 if (fElemStack.isEmpty()) 00523 emitError(XMLErrs::CDATAOutsideOfContent); 00524 scanCDSection(); 00525 break; 00526 00527 case Token_Comment : 00528 scanComment(); 00529 break; 00530 00531 case Token_EndTag : 00532 scanEndTag(gotData); 00533 break; 00534 00535 case Token_PI : 00536 scanPI(); 00537 break; 00538 00539 case Token_StartTag : 00540 if (fDoNamespaces) 00541 scanStartTagNS(gotData); 00542 else 00543 scanStartTag(gotData); 00544 break; 00545 00546 default : 00547 fReaderMgr.skipToChar(chOpenAngle); 00548 break; 00549 } 00550 00551 if (orgReader != fReaderMgr.getCurrentReaderNum()) 00552 emitError(XMLErrs::PartialMarkupInEntity); 00553 00554 // And we are back out of markup again 00555 inMarkup = false; 00556 } 00557 } 00558 catch(const EndOfEntityException& toCatch) 00559 { 00560 // If we were in some markup when this happened, then its a 00561 // partial markup error. 00562 if (inMarkup) 00563 emitError(XMLErrs::PartialMarkupInEntity); 00564 00565 // Send an end of entity reference event 00566 if (fDocHandler) 00567 fDocHandler->endEntityReference(toCatch.getEntity()); 00568 00569 inMarkup = false; 00570 } 00571 } 00572 00573 // It went ok, so return success 00574 return true; 00575 } 00576 00577 00578 void DGXMLScanner::scanEndTag(bool& gotData) 00579 { 00580 // Assume we will still have data until proven otherwise. It will only 00581 // ever be false if this is the end of the root element. 00582 gotData = true; 00583 00584 // Check if the element stack is empty. If so, then this is an unbalanced 00585 // element (i.e. more ends than starts, perhaps because of bad text 00586 // causing one to be skipped.) 
00587 if (fElemStack.isEmpty()) 00588 { 00589 emitError(XMLErrs::MoreEndThanStartTags); 00590 fReaderMgr.skipPastChar(chCloseAngle); 00591 ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Scan_UnbalancedStartEnd, fMemoryManager); 00592 } 00593 00594 // Pop the stack of the element we are supposed to be ending. Remember 00595 // that we don't own this. The stack just keeps them and reuses them. 00596 unsigned int uriId = (fDoNamespaces) 00597 ? fElemStack.getCurrentURI() : fEmptyNamespaceId; 00598 00599 // Pop the stack of the element we are supposed to be ending. Remember 00600 // that we don't own this. The stack just keeps them and reuses them. 00601 const ElemStack::StackElem* topElem = fElemStack.popTop(); 00602 XMLElementDecl *tempElement = topElem->fThisElement; 00603 00604 // See if it was the root element, to avoid multiple calls below 00605 const bool isRoot = fElemStack.isEmpty(); 00606 00607 // Make sure that its the end of the element that we expect 00608 if (!fReaderMgr.skippedStringLong(tempElement->getFullName())) 00609 { 00610 emitError 00611 ( 00612 XMLErrs::ExpectedEndOfTagX 00613 , tempElement->getFullName() 00614 ); 00615 fReaderMgr.skipPastChar(chCloseAngle); 00616 return; 00617 } 00618 00619 // Make sure we are back on the same reader as where we started 00620 if (topElem->fReaderNum != fReaderMgr.getCurrentReaderNum()) 00621 emitError(XMLErrs::PartialTagMarkupError); 00622 00623 // Skip optional whitespace 00624 fReaderMgr.skipPastSpaces(); 00625 00626 // Make sure we find the closing bracket 00627 if (!fReaderMgr.skippedChar(chCloseAngle)) 00628 { 00629 emitError 00630 ( 00631 XMLErrs::UnterminatedEndTag 00632 , topElem->fThisElement->getFullName() 00633 ); 00634 } 00635 00636 // If validation is enabled, then lets pass him the list of children and 00637 // this element and let him validate it. 
00638 if (fValidate) 00639 { 00640 00641 // 00642 // XML1.0-3rd 00643 // Validity Constraint: 00644 // The declaration matches EMPTY and the element has no content (not even 00645 // entity references, comments, PIs or white space). 00646 // 00647 if ( (topElem->fCommentOrPISeen) && 00648 (((DTDElementDecl*) topElem->fThisElement)->getModelType() == DTDElementDecl::Empty)) 00649 { 00650 fValidator->emitError 00651 ( 00652 XMLValid::EmptyElemHasContent 00653 , topElem->fThisElement->getFullName() 00654 ); 00655 } 00656 00657 // 00658 // XML1.0-3rd 00659 // Validity Constraint: 00660 // 00661 // The declaration matches children and the sequence of child elements 00662 // belongs to the language generated by the regular expression in the 00663 // content model, with optional white space, comments and PIs 00664 // (i.e. markup matching production [27] Misc) between the start-tag and 00665 // the first child element, between child elements, or between the last 00666 // child element and the end-tag. 00667 // 00668 // Note that 00669 // a CDATA section containing only white space or 00670 // a reference to an entity whose replacement text is character references 00671 // expanding to white space do not match the nonterminal S, and hence 00672 // cannot appear in these positions; however, 00673 // a reference to an internal entity with a literal value consisting 00674 // of character references expanding to white space does match S, 00675 // since its replacement text is the white space resulting from expansion 00676 // of the character references. 
00677 // 00678 if ( (topElem->fReferenceEscaped) && 00679 (((DTDElementDecl*) topElem->fThisElement)->getModelType() == DTDElementDecl::Children)) 00680 { 00681 fValidator->emitError 00682 ( 00683 XMLValid::ElemChildrenHasInvalidWS 00684 , topElem->fThisElement->getFullName() 00685 ); 00686 } 00687 00688 XMLSize_t failure; 00689 bool res = fValidator->checkContent 00690 ( 00691 topElem->fThisElement 00692 , topElem->fChildren 00693 , topElem->fChildCount 00694 , &failure 00695 ); 00696 00697 if (!res) 00698 { 00699 // One of the elements is not valid for the content. NOTE that 00700 // if no children were provided but the content model requires 00701 // them, it comes back with a zero value. But we cannot use that 00702 // to index the child array in this case, and have to put out a 00703 // special message. 00704 if (!topElem->fChildCount) 00705 { 00706 fValidator->emitError 00707 ( 00708 XMLValid::EmptyNotValidForContent 00709 , topElem->fThisElement->getFormattedContentModel() 00710 ); 00711 } 00712 else if (failure >= topElem->fChildCount) 00713 { 00714 fValidator->emitError 00715 ( 00716 XMLValid::NotEnoughElemsForCM 00717 , topElem->fThisElement->getFormattedContentModel() 00718 ); 00719 } 00720 else 00721 { 00722 fValidator->emitError 00723 ( 00724 XMLValid::ElementNotValidForContent 00725 , topElem->fChildren[failure]->getRawName() 00726 , topElem->fThisElement->getFormattedContentModel() 00727 ); 00728 } 00729 } 00730 } 00731 00732 // If we have a doc handler, tell it about the end tag 00733 if (fDocHandler) 00734 { 00735 fDocHandler->endElement 00736 ( 00737 *topElem->fThisElement 00738 , uriId 00739 , isRoot 00740 , (fDoNamespaces) 00741 ? topElem->fThisElement->getElementName()->getPrefix() 00742 : XMLUni::fgZeroLenString 00743 ); 00744 } 00745 00746 // If this was the root, then done with content 00747 gotData = !isRoot; 00748 } 00749 00750 00751 // This method handles the high level logic of scanning the DOCType 00752 // declaration. 
This calls the DTDScanner and kicks off both the scanning of 00753 // the internal subset and the scanning of the external subset, if any. 00754 // 00755 // When we get here the '<!DOCTYPE' part has already been scanned, which is 00756 // what told us that we had a doc type decl to parse. 00757 void DGXMLScanner::scanDocTypeDecl() 00758 { 00759 if (fDocTypeHandler) 00760 fDocTypeHandler->resetDocType(); 00761 00762 // There must be some space after DOCTYPE 00763 bool skippedSomething; 00764 fReaderMgr.skipPastSpaces(skippedSomething); 00765 if (!skippedSomething) 00766 { 00767 emitError(XMLErrs::ExpectedWhitespace); 00768 00769 // Just skip the Doctype declaration and return 00770 fReaderMgr.skipPastChar(chCloseAngle); 00771 return; 00772 } 00773 00774 // Get a buffer for the root element 00775 XMLBufBid bbRootName(&fBufMgr); 00776 00777 // Get a name from the input, which should be the name of the root 00778 // element of the upcoming content. 00779 int colonPosition; 00780 bool validName = fDoNamespaces ? fReaderMgr.getQName(bbRootName.getBuffer(), &colonPosition) : 00781 fReaderMgr.getName(bbRootName.getBuffer()); 00782 if (!validName) 00783 { 00784 if (bbRootName.isEmpty()) 00785 emitError(XMLErrs::NoRootElemInDOCTYPE); 00786 else 00787 emitError(XMLErrs::InvalidRootElemInDOCTYPE, bbRootName.getRawBuffer()); 00788 fReaderMgr.skipPastChar(chCloseAngle); 00789 return; 00790 } 00791 00792 // Store the root element name for later check 00793 setRootElemName(bbRootName.getRawBuffer()); 00794 00795 // This element obviously is not going to exist in the element decl 00796 // pool yet, but we need to call docTypeDecl. So force it into 00797 // the element decl pool, marked as being there because it was in 00798 // the DOCTYPE. Later, when its declared, the status will be updated. 00799 // 00800 // Only do this if we are not reusing the validator! If we are reusing, 00801 // then look it up instead. It has to exist! 
00802 MemoryManager* const rootDeclMgr = 00803 fUseCachedGrammar ? fMemoryManager : fGrammarPoolMemoryManager; 00804 00805 DTDElementDecl* rootDecl = new (rootDeclMgr) DTDElementDecl 00806 ( 00807 bbRootName.getRawBuffer() 00808 , fEmptyNamespaceId 00809 , DTDElementDecl::Any 00810 , rootDeclMgr 00811 ); 00812 00813 Janitor<DTDElementDecl> rootDeclJanitor(rootDecl); 00814 rootDecl->setCreateReason(DTDElementDecl::AsRootElem); 00815 rootDecl->setExternalElemDeclaration(true); 00816 if(!fUseCachedGrammar) 00817 { 00818 fGrammar->putElemDecl(rootDecl); 00819 rootDeclJanitor.release(); 00820 } else 00821 { 00822 // put this in the undeclared pool so it gets deleted... 00823 XMLElementDecl* elemDecl = fDTDElemNonDeclPool->getByKey(bbRootName.getRawBuffer()); 00824 if (elemDecl) 00825 { 00826 rootDecl->setId(elemDecl->getId()); 00827 } 00828 else 00829 { 00830 rootDecl->setId(fDTDElemNonDeclPool->put((DTDElementDecl*)rootDecl)); 00831 rootDeclJanitor.release(); 00832 } 00833 } 00834 00835 // Skip any spaces after the name 00836 fReaderMgr.skipPastSpaces(); 00837 00838 // And now if we are looking at a >, then we are done. It is not 00839 // required to have an internal or external subset, though why you 00840 // would not escapes me. 00841 if (fReaderMgr.skippedChar(chCloseAngle)) { 00842 00843 // If we have a doc type handler and advanced callbacks are enabled, 00844 // call the doctype event. 
00845 if (fDocTypeHandler) 00846 fDocTypeHandler->doctypeDecl(*rootDecl, 0, 0, false); 00847 return; 00848 } 00849 00850 // either internal/external subset 00851 if (fValScheme == Val_Auto && !fValidate) 00852 fValidate = true; 00853 00854 bool hasIntSubset = false; 00855 bool hasExtSubset = false; 00856 XMLCh* sysId = 0; 00857 XMLCh* pubId = 0; 00858 00859 DTDScanner dtdScanner 00860 ( 00861 (DTDGrammar*) fGrammar 00862 , fDocTypeHandler 00863 , fGrammarPoolMemoryManager 00864 , fMemoryManager 00865 ); 00866 dtdScanner.setScannerInfo(this, &fReaderMgr, &fBufMgr); 00867 00868 // If the next character is '[' then we have no external subset cause 00869 // there is no system id, just the opening character of the internal 00870 // subset. Else, has to be an id. 00871 // 00872 // Just look at the next char, don't eat it. 00873 if (fReaderMgr.peekNextChar() == chOpenSquare) 00874 { 00875 hasIntSubset = true; 00876 } 00877 else 00878 { 00879 // Indicate we have an external subset 00880 hasExtSubset = true; 00881 fHasNoDTD = false; 00882 00883 // Get buffers for the ids 00884 XMLBufBid bbPubId(&fBufMgr); 00885 XMLBufBid bbSysId(&fBufMgr); 00886 00887 // Get the external subset id 00888 if (!dtdScanner.scanId(bbPubId.getBuffer(), bbSysId.getBuffer(), DTDScanner::IDType_External)) 00889 { 00890 fReaderMgr.skipPastChar(chCloseAngle); 00891 return; 00892 } 00893 00894 // Get copies of the ids we got 00895 pubId = XMLString::replicate(bbPubId.getRawBuffer(), fMemoryManager); 00896 sysId = XMLString::replicate(bbSysId.getRawBuffer(), fMemoryManager); 00897 00898 // Skip spaces and check again for the opening of an internal subset 00899 fReaderMgr.skipPastSpaces(); 00900 00901 // Just look at the next char, don't eat it. 
00902 if (fReaderMgr.peekNextChar() == chOpenSquare) { 00903 hasIntSubset = true; 00904 } 00905 } 00906 00907 // Insure that the ids get cleaned up, if they got allocated 00908 ArrayJanitor<XMLCh> janSysId(sysId, fMemoryManager); 00909 ArrayJanitor<XMLCh> janPubId(pubId, fMemoryManager); 00910 00911 // If we have a doc type handler and advanced callbacks are enabled, 00912 // call the doctype event. 00913 if (fDocTypeHandler) 00914 fDocTypeHandler->doctypeDecl(*rootDecl, pubId, sysId, hasIntSubset, hasExtSubset); 00915 00916 // Ok, if we had an internal subset, we are just past the [ character 00917 // and need to parse that first. 00918 if (hasIntSubset) 00919 { 00920 // Eat the opening square bracket 00921 fReaderMgr.getNextChar(); 00922 00923 checkInternalDTD(hasExtSubset, sysId, pubId); 00924 00925 // And try to scan the internal subset. If we fail, try to recover 00926 // by skipping forward tot he close angle and returning. 00927 if (!dtdScanner.scanInternalSubset()) 00928 { 00929 fReaderMgr.skipPastChar(chCloseAngle); 00930 return; 00931 } 00932 00933 // Do a sanity check that some expanded PE did not propogate out of 00934 // the doctype. This could happen if it was terminated early by bad 00935 // syntax. 00936 if (fReaderMgr.getReaderDepth() > 1) 00937 { 00938 emitError(XMLErrs::PEPropogated); 00939 00940 // Ask the reader manager to pop back down to the main level 00941 fReaderMgr.cleanStackBackTo(1); 00942 } 00943 00944 fReaderMgr.skipPastSpaces(); 00945 } 00946 00947 // And that should leave us at the closing > of the DOCTYPE line 00948 if (!fReaderMgr.skippedChar(chCloseAngle)) 00949 { 00950 // Do a special check for the common scenario of an extra ] char at 00951 // the end. This is easy to recover from. 
00952 if (fReaderMgr.skippedChar(chCloseSquare) 00953 && fReaderMgr.skippedChar(chCloseAngle)) 00954 { 00955 emitError(XMLErrs::ExtraCloseSquare); 00956 } 00957 else 00958 { 00959 emitError(XMLErrs::UnterminatedDOCTYPE); 00960 fReaderMgr.skipPastChar(chCloseAngle); 00961 } 00962 } 00963 00964 // If we had an external subset, then we need to deal with that one 00965 // next. If we are reusing the validator, then don't scan it. 00966 if (hasExtSubset) { 00967 00968 InputSource* srcUsed=0; 00969 Janitor<InputSource> janSrc(srcUsed); 00970 // If we had an internal subset and we're using the cached grammar, it 00971 // means that the ignoreCachedDTD is set, so we ignore the cached 00972 // grammar 00973 if (fUseCachedGrammar && !hasIntSubset) 00974 { 00975 srcUsed = resolveSystemId(sysId, pubId); 00976 if (srcUsed) { 00977 janSrc.reset(srcUsed); 00978 Grammar* grammar = fGrammarResolver->getGrammar(srcUsed->getSystemId()); 00979 00980 if (grammar && grammar->getGrammarType() == Grammar::DTDGrammarType) { 00981 00982 fDTDGrammar = (DTDGrammar*) grammar; 00983 fGrammar = fDTDGrammar; 00984 fValidator->setGrammar(fGrammar); 00985 // If we don't report at least the external subset boundaries, 00986 // an advanced document handler cannot know when the DTD end, 00987 // since we've already sent a doctype decl that indicates there's 00988 // there's an external subset. 
00989 if (fDocTypeHandler) 00990 { 00991 fDocTypeHandler->startExtSubset(); 00992 fDocTypeHandler->endExtSubset(); 00993 } 00994 00995 return; 00996 } 00997 } 00998 } 00999 01000 if (fLoadExternalDTD || fValidate) 01001 { 01002 // And now create a reader to read this entity 01003 XMLReader* reader; 01004 if(srcUsed) { 01005 reader = fReaderMgr.createReader 01006 ( 01007 *srcUsed 01008 , false 01009 , XMLReader::RefFrom_NonLiteral 01010 , XMLReader::Type_General 01011 , XMLReader::Source_External 01012 , fCalculateSrcOfs 01013 , fLowWaterMark 01014 ); 01015 } 01016 else { 01017 reader = fReaderMgr.createReader 01018 ( 01019 sysId 01020 , pubId 01021 , false 01022 , XMLReader::RefFrom_NonLiteral 01023 , XMLReader::Type_General 01024 , XMLReader::Source_External 01025 , srcUsed 01026 , fCalculateSrcOfs 01027 , fLowWaterMark 01028 , fDisableDefaultEntityResolution 01029 ); 01030 janSrc.reset(srcUsed); 01031 } 01032 // If it failed then throw an exception 01033 if (!reader) 01034 ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Gen_CouldNotOpenDTD, srcUsed ? srcUsed->getSystemId() : sysId, fMemoryManager); 01035 01036 if (fToCacheGrammar) { 01037 01038 unsigned int stringId = fGrammarResolver->getStringPool()->addOrFind(srcUsed->getSystemId()); 01039 const XMLCh* sysIdStr = fGrammarResolver->getStringPool()->getValueForId(stringId); 01040 01041 fGrammarResolver->orphanGrammar(XMLUni::fgDTDEntityString); 01042 ((XMLDTDDescription*) (fGrammar->getGrammarDescription()))->setSystemId(sysIdStr); 01043 fGrammarResolver->putGrammar(fGrammar); 01044 } 01045 01046 // In order to make the processing work consistently, we have to 01047 // make this look like an external entity. So create an entity 01048 // decl and fill it in and push it with the reader, as happens 01049 // with an external entity. Put a janitor on it to insure it gets 01050 // cleaned up. The reader manager does not adopt them. 
const XMLCh gDTDStr[] = { chLatin_D, chLatin_T, chLatin_D , chNull };
            DTDEntityDecl* declDTD = new (fMemoryManager) DTDEntityDecl(gDTDStr, false, fMemoryManager);
            declDTD->setSystemId(sysId);
            declDTD->setIsExternal(true);
            Janitor<DTDEntityDecl> janDecl(declDTD);

            // Mark this one as a throw at end
            reader->setThrowAtEnd(true);

            // And push it onto the stack, with its pseudo name
            fReaderMgr.pushReader(reader, declDTD);

            // Tell it its not in an include section
            dtdScanner.scanExtSubsetDecl(false, true);
        }
    }
}

//  Scans one start tag when namespace processing is not in effect: the
//  element name is read with getName() and used whole as the QName, and
//  the document handler is given fEmptyNamespaceId and no prefix.
//
//  The first thing read is the element name.
//  NOTE(review): presumably the caller has already consumed the '<' -
//  confirm against the caller.
//
//  gotData - set to true on entry; set to false only when the tag is an
//            empty element ("/>") and it is the root element.
//
//  Returns false when the element name is invalid, or when attribute
//  scanning hits an error it does not resync from. Returns true otherwise,
//  including when recoverable errors were reported through emitError().
bool DGXMLScanner::scanStartTag(bool& gotData)
{
    //  Assume we will still have data until proven otherwise. It will only
    //  ever be false if this is the root and it is empty.
    gotData = true;

    //  Get the QName. In this case, we are not doing namespaces, so we just
    //  use it as is and don't have to break it into parts.

    bool validName = fReaderMgr.getName(fQNameBuf);
    if (!validName)
    {
        if (fQNameBuf.isEmpty())
            emitError(XMLErrs::ExpectedElementName);
        else
            emitError(XMLErrs::InvalidElementName, fQNameBuf.getRawBuffer());
        // Resync on the next '<' and give up on this tag
        fReaderMgr.skipToChar(chOpenAngle);
        return false;
    }

    // Assume it won't be an empty tag
    bool isEmpty = false;

    // See if it's the root element (nothing on the element stack yet)
    const bool isRoot = fElemStack.isEmpty();

    //  Let's try to look up the element in the validator's element decl pool.
    //  We can pass bogus values for the URI id and the base name. We know that
    //  this can only be called if we are doing a DTD style validator and that
    //  it will only look at the QName.
    //
    //  We *do not* tell it to fault in a decl if it does not find one - NG.
    bool wasAdded = false;
    const XMLCh* qnameRawBuf = fQNameBuf.getRawBuffer();

    XMLElementDecl* elemDecl = fGrammar->getElemDecl
    (
        fEmptyNamespaceId
        , 0
        , qnameRawBuf
        , Grammar::TOP_LEVEL_SCOPE
    );
    // Not in the grammar: look in the pool of undeclared elements
    if(!elemDecl)
    {
        elemDecl = fDTDElemNonDeclPool->getByKey(qnameRawBuf);
    }
    // Still not found: create a decl with an 'Any' content model and put it
    // in the undeclared pool, using the id returned by put() as its id.
    if(!elemDecl)
    {
        wasAdded = true;
        elemDecl = new (fMemoryManager) DTDElementDecl
        (
            qnameRawBuf
            , fEmptyNamespaceId
            , DTDElementDecl::Any
            , fMemoryManager
        );
        elemDecl->setId(fDTDElemNonDeclPool->put((DTDElementDecl*)elemDecl));
    }

    if (fValidate) {

        if (wasAdded)
        {
            // This is to tell the reuse Validator that this element was
            // faulted-in, was not an element in the validator pool originally
            elemDecl->setCreateReason(XMLElementDecl::JustFaultIn);

            fValidator->emitError
            (
                XMLValid::ElementNotDefined
                , qnameRawBuf
            );
        }
        // If it's not marked declared, then emit an error
        else if (!elemDecl->isDeclared())
        {
            fValidator->emitError
            (
                XMLValid::ElementNotDefined
                , qnameRawBuf
            );
        }


        fValidator->validateElement(elemDecl);
    }

    // Expand the element stack and add the new element
    fElemStack.addLevel(elemDecl, fReaderMgr.getCurrentReaderNum());

    //  If this is the first element and we are validating, check the root
    //  element.
    if (isRoot)
    {
        fRootGrammar = fGrammar;

        if (fValidate)
        {
            //  If a DocType exists, then check if it matches the root name there.
            if (fRootElemName && !XMLString::equals(qnameRawBuf, fRootElemName))
                fValidator->emitError(XMLValid::RootElemNotLikeDocType);
        }
    }
    else if (fValidate)
    {
        //  The element stack was not empty before this element was pushed,
        //  so record this element as a child of the previous top element.
        //  (The root is not the child of anything.)
        fElemStack.addChild(elemDecl->getElementName(), true);
    }

    // Skip any whitespace after the name
    fReaderMgr.skipPastSpaces();

    //  We loop until we either see a /> or >, handling attribute/value
    //  pairs until we get there.
    XMLSize_t    attCount = 0;
    // Number of XMLAttr objects already in fAttrList that can be reused
    XMLSize_t    curAttListSize = fAttrList->size();
    // wasAdded is not read again in this function
    wasAdded = false;

    // One count per start tag; the duplicate-attribute check below compares
    // against this value.
    fElemCount++;

    while (true)
    {
        // And get the next non-space character
        XMLCh nextCh = fReaderMgr.peekNextChar();

        //  If the next character is not a slash or closed angle bracket,
        //  then it must be whitespace, since whitespace is required
        //  between the end of the last attribute and the name of the next
        //  one.
        if (attCount)
        {
            if ((nextCh != chForwardSlash) && (nextCh != chCloseAngle))
            {
                if (fReaderMgr.getCurrentReader()->isWhitespace(nextCh))
                {
                    // Ok, skip by them and peek another char
                    fReaderMgr.skipPastSpaces();
                    nextCh = fReaderMgr.peekNextChar();
                }
                else
                {
                    // Emit the error but keep on going
                    emitError(XMLErrs::ExpectedWhitespace);
                }
            }
        }

        //  Ok, here we first check for any of the special case characters.
        //  If it's not one, then we do the normal case processing, which
        //  assumes that we've hit an attribute. Otherwise, we do all
        //  the special case checks.
        if (!fReaderMgr.getCurrentReader()->isSpecialStartTagChar(nextCh))
        {
            //  Assume it's going to be an attribute, so get a name from
            //  the input.

            validName = fReaderMgr.getName(fAttNameBuf);
            if (!validName)
            {
                if (fAttNameBuf.isEmpty())
                    emitError(XMLErrs::ExpectedAttrName);
                else
                    emitError(XMLErrs::InvalidAttrName, fAttNameBuf.getRawBuffer());
                // Skip the rest of this tag and give up on it
                fReaderMgr.skipPastChar(chCloseAngle);
                return false;
            }

            // And next must be an equal sign
            if (!scanEq())
            {
                static const XMLCh tmpList[] =
                {
                    chSingleQuote, chDoubleQuote, chCloseAngle
                    , chOpenAngle, chForwardSlash, chNull
                };

                emitError(XMLErrs::ExpectedEqSign);

                //  Try to sync back up by skipping forward until we hit
                //  whitespace or one of the characters in tmpList.
                const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList);

                if ((chFound == chCloseAngle) || (chFound == chForwardSlash))
                {
                    // Jump back to top for normal processing of these.
                    // The attribute name just read is not recorded.
                    continue;
                }
                else if ((chFound == chSingleQuote)
                      ||  (chFound == chDoubleQuote)
                      ||  fReaderMgr.getCurrentReader()->isWhitespace(chFound))
                {
                    // Just fall through assuming that the value is to follow
                }
                else if (chFound == chOpenAngle)
                {
                    // Assume a malformed tag and that new one is starting
                    emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName());
                    return false;
                }
                else
                {
                    // Something went really wrong
                    return false;
                }
            }

            //  See if this attribute is declared for this element; attDef
            //  is null when the DTD does not declare it.
            XMLCh * namePtr = fAttNameBuf.getRawBuffer();
            XMLAttDef* attDef = ((DTDElementDecl *)elemDecl)->getAttDef(namePtr);

            //  Skip any whitespace before the value and then scan the att
            //  value. This will come back normalized with entity refs and
            //  char refs expanded.
            fReaderMgr.skipPastSpaces();
            if (!scanAttValue(attDef, namePtr, fAttValueBuf))
            {
                static const XMLCh tmpList[] =
                {
                    chCloseAngle, chOpenAngle, chForwardSlash, chNull
                };

                emitError(XMLErrs::ExpectedAttrValue);

                //  It failed, so let's try to get synced back up. We skip
                //  forward until we find some whitespace or one of the
                //  chars in our list.
                const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList);

                if ((chFound == chCloseAngle)
                ||  (chFound == chForwardSlash)
                ||  fReaderMgr.getCurrentReader()->isWhitespace(chFound))
                {
                    //  Just fall through and process this attribute, though
                    //  the value will be "".
                }
                else if (chFound == chOpenAngle)
                {
                    // Assume a malformed tag and that new one is starting
                    emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName());
                    return false;
                }
                else
                {
                    // Something went really wrong
                    return false;
                }
            }

            //  Add this attribute to the attribute list that we use to
            //  pass them to the handler. We reuse its existing elements
            //  but expand it as required.
            //  Note that we want to do this first since this will
            //  make a copy of the namePtr; we can then make use of
            //  that copy in the hashtable lookup that checks
            //  for duplicates. This will mean we may have to update
            //  the type of the XMLAttr later.
            XMLAttr* curAtt;
            const XMLCh* attrValue = fAttValueBuf.getRawBuffer();

            if (attCount >= curAttListSize) {
                // No reusable slot left: allocate a new XMLAttr and append it
                curAtt = new (fMemoryManager) XMLAttr(fMemoryManager);
                fAttrList->addElement(curAtt);
            }
            else {
                curAtt = fAttrList->elementAt(attCount);
            }

            curAtt->setSpecified(true);

            // NO NAMESPACE CODE
            {
                // Attribute type comes from the declaration when there is
                // one, and is CData otherwise.
                curAtt->set(
                    0, namePtr, XMLUni::fgZeroLenString, XMLUni::fgZeroLenString
                    , (attDef)?attDef->getType():XMLAttDef::CData
                );

                //  Duplicate detection for declared attributes:
                //  fAttDefRegistry holds, per attDef, the fElemCount value of
                //  the start tag it was last recorded for. A stored value that
                //  is not less than the current fElemCount means this attDef
                //  was already recorded for this same start tag.
                if (attDef) {
                    unsigned int *curCountPtr = fAttDefRegistry->get(attDef);
                    if (!curCountPtr) {
                        curCountPtr = getNewUIntPtr();
                        *curCountPtr = fElemCount;
                        fAttDefRegistry->put(attDef, curCountPtr);
                    }
                    else if (*curCountPtr < fElemCount) {
                        *curCountPtr = fElemCount;
                    }
                    else {
                        emitError(
                            XMLErrs::AttrAlreadyUsedInSTag
                            , attDef->getFullName(), elemDecl->getFullName()
                        );
                    }
                }
                else
                {
                    //  Undeclared attribute: key the duplicate check on the
                    //  name stored in curAtt rather than on fAttNameBuf's
                    //  buffer, so reset namePtr to refer to that memory.
                    namePtr = (XMLCh *)curAtt->getQName();
                    if (!fUndeclaredAttrRegistry->putIfNotPresent(namePtr, 0))
                    {
                        emitError(
                            XMLErrs::AttrAlreadyUsedInSTag
                            , namePtr, elemDecl->getFullName()
                        );
                    }
                }
            }

            if (fValidate)
            {
                if (attDef) {
                    // Let the validator pass judgement on the attribute value
                    fValidator->validateAttrValue(
                        attDef, fAttValueBuf.getRawBuffer(), false, elemDecl
                    );
                }
                else
                {
                    fValidator->emitError
                    (
                        XMLValid::AttNotDefinedForElement
                        , fAttNameBuf.getRawBuffer(), qnameRawBuf
                    );
                }
            }

            // must set the newly-minted value on the XMLAttr:
            curAtt->setValue(attrValue);
            attCount++;

            // And jump back to the top of the loop
            continue;
        }

        //  It was some special case character so do all of the checks and
        //  deal with it.
        if (!nextCh)
            ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);

        if (nextCh == chForwardSlash)
        {
            // "/>" - an empty element; a '/' not followed by '>' is reported
            // but still ends the tag.
            fReaderMgr.getNextChar();
            isEmpty = true;
            if (!fReaderMgr.skippedChar(chCloseAngle))
                emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName());
            break;
        }
        else if (nextCh == chCloseAngle)
        {
            fReaderMgr.getNextChar();
            break;
        }
        else if (nextCh == chOpenAngle)
        {
            //  Check for this one specially, since it's going to be common
            //  and it is kind of auto-recovering since we've already hit the
            //  next open bracket, which is what we would have sought to (and
            //  skipped this whole tag.)
            emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName());
            break;
        }
        else if ((nextCh == chSingleQuote) || (nextCh == chDoubleQuote))
        {
            //  Check for this one specially, which is probably a missing
            //  attribute name, e.g. ="value". Just issue expected name
            //  error and eat the quoted string, then jump back to the
            //  top again.
            emitError(XMLErrs::ExpectedAttrName);
            fReaderMgr.getNextChar();
            fReaderMgr.skipQuotedString(nextCh);
            fReaderMgr.skipPastSpaces();
            continue;
        }
    }

    if(attCount)
    {
        // clean up after ourselves:
        // clear the map used to detect duplicate (undeclared) attributes.
        // Note that the early 'return false' paths above do not get here.
        fUndeclaredAttrRegistry->removeAll();
    }

    //  Now let's get the fAttrList filled in. This involves faulting in any
    //  defaulted and fixed attributes and normalizing the values of any that
    //  we got explicitly.
    //
    //  We update the attCount value with the total number of attributes, but
    //  it goes in with the number of values we got during the raw scan of
    //  explicitly provided attrs above.
    attCount = buildAttList(attCount, elemDecl, *fAttrList);

    //  If we have a document handler, then tell it about this start tag. We
    //  don't have any URI id to send along, so send fEmptyNamespaceId. We also do not send
    //  any prefix since it's just one big name if we are not doing namespaces.
    unsigned int uriId = fEmptyNamespaceId;
    if (fDocHandler)
    {
        fDocHandler->startElement
        (
            *elemDecl
            , uriId
            , 0
            , *fAttrList
            , attCount
            , isEmpty
            , isRoot
        );
    }

    //  If empty, validate content right now if we are validating and then
    //  pop the element stack top. Otherwise the element stays on the stack.
    if (isEmpty)
    {
        // If validating, then ensure that it's legal to have no content
        if (fValidate)
        {
            // 'failure' is an out-parameter of checkContent(); it is not
            // read here.
            XMLSize_t failure;
            bool res = fValidator->checkContent(elemDecl, 0, 0, &failure);
            if (!res)
            {
                fValidator->emitError
                (
                    XMLValid::ElementNotValidForContent
                    , qnameRawBuf
                    , elemDecl->getFormattedContentModel()
                );
            }
        }

        // Pop the element stack back off since it'll never be used now
        fElemStack.popTop();

        // If this was the root, then it was an empty root
        if (isRoot)
            gotData = false;
    }

    return true;
}


//  Namespace-aware counterpart of scanStartTag(). Element and attribute
//  names are read with getQName(), xmlns attributes update the namespace
//  map, and the document handler receives the element's resolved URI id
//  and prefix. gotData and the return value follow the same rules as in
//  scanStartTag().
bool DGXMLScanner::scanStartTagNS(bool& gotData)
{
    //  Assume we will still have data until proven otherwise. It will only
    //  ever be false if this is the root and it is empty.
    gotData = true;

    //  Get the QName. Unlike scanStartTag(), this variant reads it with
    //  getQName(), which also hands back the position of the colon.
int colonPosition;
    // colonPosition is only passed to getQName(); it is not read again in
    // this function.
    bool validName = fReaderMgr.getQName(fQNameBuf, &colonPosition);
    if (!validName)
    {
        if (fQNameBuf.isEmpty())
            emitError(XMLErrs::ExpectedElementName);
        else
            emitError(XMLErrs::InvalidElementName, fQNameBuf.getRawBuffer());
        // Resync on the next '<' and give up on this tag
        fReaderMgr.skipToChar(chOpenAngle);
        return false;
    }

    // Assume it won't be an empty tag
    bool isEmpty = false;

    // See if it's the root element (nothing on the element stack yet)
    const bool isRoot = fElemStack.isEmpty();

    //  Let's try to look up the element in the validator's element decl pool.
    //  We can pass bogus values for the URI id and the base name. We know that
    //  this can only be called if we are doing a DTD style validator and that
    //  it will only look at the QName.
    //
    //  We *do not* tell it to fault in a decl if it does not find one - NG.
    bool wasAdded = false;
    const XMLCh* qnameRawBuf = fQNameBuf.getRawBuffer();

    XMLElementDecl* elemDecl = fGrammar->getElemDecl
    (
        fEmptyNamespaceId
        , 0
        , qnameRawBuf
        , Grammar::TOP_LEVEL_SCOPE
    );
    // Not in the grammar: look in the pool of undeclared elements
    if(!elemDecl)
    {
        elemDecl = fDTDElemNonDeclPool->getByKey(qnameRawBuf);
    }
    // Still not found: create a decl with an 'Any' content model and put it
    // in the undeclared pool, using the id returned by put() as its id.
    if(!elemDecl)
    {
        wasAdded = true;
        elemDecl = new (fMemoryManager) DTDElementDecl
        (
            qnameRawBuf
            , fEmptyNamespaceId
            , DTDElementDecl::Any
            , fMemoryManager
        );
        elemDecl->setId(fDTDElemNonDeclPool->put((DTDElementDecl*)elemDecl));
    }

    if (fValidate) {

        if (wasAdded)
        {
            // This is to tell the reuse Validator that this element was
            // faulted-in, was not an element in the validator pool originally
            elemDecl->setCreateReason(XMLElementDecl::JustFaultIn);

            fValidator->emitError
            (
                XMLValid::ElementNotDefined
                , qnameRawBuf
            );
        }
        // If it's not marked declared, then emit an error
        else if (!elemDecl->isDeclared())
        {
            fValidator->emitError
            (
                XMLValid::ElementNotDefined
                , qnameRawBuf
            );
        }


        fValidator->validateElement(elemDecl);
    }

    // Expand the element stack and add the new element
    fElemStack.addLevel(elemDecl, fReaderMgr.getCurrentReaderNum());

    //  If this is the first element and we are validating, check the root
    //  element.
    if (isRoot)
    {
        fRootGrammar = fGrammar;

        if (fValidate)
        {
            //  If a DocType exists, then check if it matches the root name there.
            if (fRootElemName && !XMLString::equals(qnameRawBuf, fRootElemName))
                fValidator->emitError(XMLValid::RootElemNotLikeDocType);
        }
    }
    else if (fValidate)
    {
        //  The element stack was not empty before this element was pushed,
        //  so record this element as a child of the previous top element.
        //  (The root is not the child of anything.)
        fElemStack.addChild(elemDecl->getElementName(), true);
    }

    // Skip any whitespace after the name
    fReaderMgr.skipPastSpaces();

    //  We loop until we either see a /> or >, handling attribute/value
    //  pairs until we get there.
    XMLSize_t    attCount = 0;
    // Number of XMLAttr objects already in fAttrList that can be reused
    XMLSize_t    curAttListSize = fAttrList->size();
    // wasAdded is not read again in this function
    wasAdded = false;

    // One count per start tag; recorded per declared attribute below
    fElemCount++;

    while (true)
    {
        // And get the next non-space character
        XMLCh nextCh = fReaderMgr.peekNextChar();

        //  If the next character is not a slash or closed angle bracket,
        //  then it must be whitespace, since whitespace is required
        //  between the end of the last attribute and the name of the next
        //  one.
        if (attCount)
        {
            if ((nextCh != chForwardSlash) && (nextCh != chCloseAngle))
            {
                if (fReaderMgr.getCurrentReader()->isWhitespace(nextCh))
                {
                    // Ok, skip by them and peek another char
                    fReaderMgr.skipPastSpaces();
                    nextCh = fReaderMgr.peekNextChar();
                }
                else
                {
                    // Emit the error but keep on going
                    emitError(XMLErrs::ExpectedWhitespace);
                }
            }
        }

        //  Ok, here we first check for any of the special case characters.
        //  If it's not one, then we do the normal case processing, which
        //  assumes that we've hit an attribute. Otherwise, we do all
        //  the special case checks.
        if (!fReaderMgr.getCurrentReader()->isSpecialStartTagChar(nextCh))
        {
            //  Assume it's going to be an attribute, so get a name from
            //  the input.

            validName = fReaderMgr.getQName(fAttNameBuf, &colonPosition);
            if (!validName)
            {
                if (fAttNameBuf.isEmpty())
                    emitError(XMLErrs::ExpectedAttrName);
                else
                    emitError(XMLErrs::InvalidAttrName, fAttNameBuf.getRawBuffer());
                // Skip the rest of this tag and give up on it
                fReaderMgr.skipPastChar(chCloseAngle);
                return false;
            }

            // And next must be an equal sign
            if (!scanEq())
            {
                static const XMLCh tmpList[] =
                {
                    chSingleQuote, chDoubleQuote, chCloseAngle
                    , chOpenAngle, chForwardSlash, chNull
                };

                emitError(XMLErrs::ExpectedEqSign);

                //  Try to sync back up by skipping forward until we hit
                //  whitespace or one of the characters in tmpList.
                const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList);

                if ((chFound == chCloseAngle) || (chFound == chForwardSlash))
                {
                    // Jump back to top for normal processing of these.
                    // The attribute name just read is not recorded.
                    continue;
                }
                else if ((chFound == chSingleQuote)
                      ||  (chFound == chDoubleQuote)
                      ||  fReaderMgr.getCurrentReader()->isWhitespace(chFound))
                {
                    // Just fall through assuming that the value is to follow
                }
                else if (chFound == chOpenAngle)
                {
                    // Assume a malformed tag and that new one is starting
                    emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName());
                    return false;
                }
                else
                {
                    // Something went really wrong
                    return false;
                }
            }

            //  See if this attribute is declared for this element; attDef
            //  is null when the DTD does not declare it.
            XMLCh * namePtr = fAttNameBuf.getRawBuffer();
            XMLAttDef* attDef = ((DTDElementDecl *)elemDecl)->getAttDef(namePtr);

            //  Skip any whitespace before the value and then scan the att
            //  value. This will come back normalized with entity refs and
            //  char refs expanded.
            fReaderMgr.skipPastSpaces();
            if (!scanAttValue(attDef, namePtr, fAttValueBuf))
            {
                static const XMLCh tmpList[] =
                {
                    chCloseAngle, chOpenAngle, chForwardSlash, chNull
                };

                emitError(XMLErrs::ExpectedAttrValue);

                //  It failed, so let's try to get synced back up. We skip
                //  forward until we find some whitespace or one of the
                //  chars in our list.
                const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList);

                if ((chFound == chCloseAngle)
                ||  (chFound == chForwardSlash)
                ||  fReaderMgr.getCurrentReader()->isWhitespace(chFound))
                {
                    //  Just fall through and process this attribute, though
                    //  the value will be "".
                }
                else if (chFound == chOpenAngle)
                {
                    // Assume a malformed tag and that new one is starting
                    emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName());
                    return false;
                }
                else
                {
                    // Something went really wrong
                    return false;
                }
            }

            //  Add this attribute to the attribute list that we use to
            //  pass them to the handler. We reuse its existing elements
            //  but expand it as required.
            //  Note that we want to do this first since this will
            //  make a copy of the namePtr; we can then make use of
            //  that copy in the hashtable lookup that checks
            //  for duplicates. This will mean we may have to update
            //  the type of the XMLAttr later.
            XMLAttr* curAtt;
            const XMLCh* attrValue = fAttValueBuf.getRawBuffer();

            if (attCount >= curAttListSize) {
                // No reusable slot left: allocate a new XMLAttr and append it
                curAtt = new (fMemoryManager) XMLAttr(fMemoryManager);
                fAttrList->addElement(curAtt);
            }
            else {
                curAtt = fAttrList->elementAt(attCount);
            }

            curAtt->setSpecified(true);
            // DO NAMESPACES
            {
                // Start with the empty namespace id; it is replaced below
                // for the xml: and xmlns: prefixes.
                curAtt->set(
                    fEmptyNamespaceId, namePtr, XMLUni::fgZeroLenString
                    , (attDef)? attDef->getType() : XMLAttDef::CData
                );

                // each attribute has the prefix:suffix="value"
                const XMLCh* attPrefix = curAtt->getPrefix();
                const XMLCh* attLocalName = curAtt->getName();

                if (attPrefix && *attPrefix) {
                    if (XMLString::equals(attPrefix, XMLUni::fgXMLString)) {
                        // xml:* attribute
                        curAtt->setURIId(fXMLNamespaceId);
                    }
                    else if (XMLString::equals(attPrefix, XMLUni::fgXMLNSString)) {
                        // xmlns:prefix="uri" - a namespace declaration
                        curAtt->setURIId(fXMLNSNamespaceId);
                        updateNSMap(attPrefix, attLocalName, attrValue);
                    }
                    else {
                        // Any other prefix: queue the attribute on
                        // fAttrNSList (NOTE(review): presumably its URI is
                        // resolved once the whole tag has been scanned -
                        // confirm in scanAttrListforNameSpaces).
                        fAttrNSList->addElement(curAtt);
                    }
                }
                else if (XMLString::equals(XMLUni::fgXMLNSString, attLocalName))
                {
                    // xmlns="uri" - default namespace declaration
                    updateNSMap(attPrefix, XMLUni::fgZeroLenString, attrValue);
                }

                // NOTE: duplicate attribute check will be done, when we map
                //       namespaces to all attributes. Here we only record
                //       the current fElemCount for a declared attribute; no
                //       error is emitted at this point.
                if (attDef) {
                    unsigned int *curCountPtr = fAttDefRegistry->get(attDef);
                    if (!curCountPtr) {
                        curCountPtr = getNewUIntPtr();
                        *curCountPtr = fElemCount;
                        fAttDefRegistry->put(attDef, curCountPtr);
                    }
                    else if (*curCountPtr < fElemCount) {
                        *curCountPtr = fElemCount;
                    }
                }
            }

            if (fValidate)
            {
                if (attDef) {
                    // Let the validator pass judgement on the attribute value
                    fValidator->validateAttrValue(
                        attDef, fAttValueBuf.getRawBuffer(), false, elemDecl
                    );
                }
                else
                {
                    fValidator->emitError
                    (
                        XMLValid::AttNotDefinedForElement
                        , fAttNameBuf.getRawBuffer(), qnameRawBuf
                    );
                }
            }

            // must set the newly-minted value on the XMLAttr:
            curAtt->setValue(attrValue);
            attCount++;

            // And jump back to the top of the loop
            continue;
        }

        //  It was some special case character so do all of the checks and
        //  deal with it.
        if (!nextCh)
            ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);

        if (nextCh == chForwardSlash)
        {
            // "/>" - an empty element; a '/' not followed by '>' is reported
            // but still ends the tag.
            fReaderMgr.getNextChar();
            isEmpty = true;
            if (!fReaderMgr.skippedChar(chCloseAngle))
                emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName());
            break;
        }
        else if (nextCh == chCloseAngle)
        {
            fReaderMgr.getNextChar();
            break;
        }
        else if (nextCh == chOpenAngle)
        {
            //  Check for this one specially, since it's going to be common
            //  and it is kind of auto-recovering since we've already hit the
            //  next open bracket, which is what we would have sought to (and
            //  skipped this whole tag.)
            emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName());
            break;
        }
        else if ((nextCh == chSingleQuote) || (nextCh == chDoubleQuote))
        {
            //  Check for this one specially, which is probably a missing
            //  attribute name, e.g. ="value". Just issue expected name
            //  error and eat the quoted string, then jump back to the
            //  top again.
            emitError(XMLErrs::ExpectedAttrName);
            fReaderMgr.getNextChar();
            fReaderMgr.skipQuotedString(nextCh);
            fReaderMgr.skipPastSpaces();
            continue;
        }
    }

    // Make an initial pass through the list and find any xmlns attributes.
    if (attCount)
        scanAttrListforNameSpaces(fAttrList, attCount, elemDecl);

    if(attCount)
    {
        // clean up after ourselves:
        // clear the map used to detect duplicate attributes.
        // Note that the early 'return false' paths above do not get here.
        fUndeclaredAttrRegistry->removeAll();
    }

    //  Now let's get the fAttrList filled in. This involves faulting in any
    //  defaulted and fixed attributes and normalizing the values of any that
    //  we got explicitly.
    //
    //  We update the attCount value with the total number of attributes, but
    //  it goes in with the number of values we got during the raw scan of
    //  explicitly provided attrs above.
    attCount = buildAttList(attCount, elemDecl, *fAttrList);

    //  If we have a document handler, then tell it about this start tag.
    //  The URI id is obtained by resolving the element's prefix in element
    //  mode, and the prefix itself is passed along as well.
    if (fDocHandler)
    {
        unsigned int uriId = resolvePrefix
        (
            elemDecl->getElementName()->getPrefix()
            , ElemStack::Mode_Element
        );

        fDocHandler->startElement
        (
            *elemDecl
            , uriId
            , elemDecl->getElementName()->getPrefix()
            , *fAttrList
            , attCount
            , isEmpty
            , isRoot
        );
    }

    //  If empty, validate content right now if we are validating and then
    //  pop the element stack top. Otherwise the element stays on the stack.
    if (isEmpty)
    {
        // If validating, then ensure that it's legal to have no content
        if (fValidate)
        {
            // 'failure' is an out-parameter of checkContent(); it is not
            // read here.
            XMLSize_t failure;
            bool res = fValidator->checkContent(elemDecl, 0, 0, &failure);
            if (!res)
            {
                fValidator->emitError
                (
                    XMLValid::ElementNotValidForContent
                    , qnameRawBuf
                    , elemDecl->getFormattedContentModel()
                );
            }
        }

        // Pop the element stack back off since it'll never be used now
        fElemStack.popTop();

        // If this was the root, then it was an empty root
        if (isRoot)
            gotData = false;
    }

    return true;
}

// ---------------------------------------------------------------------------
//  DGXMLScanner: Grammar preparsing
// ---------------------------------------------------------------------------
//  Preparses the grammar found in 'src'. Only Grammar::DTDGrammarType is
//  handled here (delegated to loadDTDGrammar()); for any other grammarType
//  nothing is loaded and 0 is returned. 'toCache' is passed through to
//  loadDTDGrammar().
Grammar* DGXMLScanner::loadGrammar(const   InputSource& src
                                   , const short        grammarType
                                   , const bool         toCache)
{
    Grammar* loadedGrammar = 0;

    //  NOTE(review): presumably a janitor that calls ReaderMgr::reset() on
    //  fReaderMgr when this scope is left, unless release() is called first
    //  (see the OutOfMemoryException handlers) - confirm.
    ReaderMgrResetType  resetReaderMgr(&fReaderMgr, &ReaderMgr::reset);

    try
    {
        // Neither cache from, nor use cached grammars in, this preparse
        fGrammarResolver->cacheGrammarFromParse(false);
        fGrammarResolver->useCachedGrammarInParse(false);
        fRootGrammar = 0;

        // Automatic validation is treated as "validate" for a preparse
        if (fValScheme == Val_Auto) {
            fValidate = true;
        }

        // Reset some status flags
        fInException = false;
        fStandalone = false;
        fErrorCount = 0;
        fHasNoDTD = true;

        if (grammarType == Grammar::DTDGrammarType) {
            loadedGrammar = loadDTDGrammar(src, toCache);
        }
    }
    //  NOTE:
    //
    //  In all of the error processing below, the emitError() call MUST come
    //  before the flush of the reader mgr, or it will fail because it tries
    //  to find out the position in the XML source of the error.
catch(const XMLErrs::Codes)
    {
        // This is a 'first failure' exception, so fall through
    }
    catch(const XMLValid::Codes)
    {
        // This is a 'first fatal error' type exit, so fall through
    }
    catch(const XMLException& excToCatch)
    {
        //  Report the exception through emitError(), using the
        //  warning/fatal/error code that matches its error type. An
        //  OutOfMemoryException raised while reporting is rethrown below;
        //  anything else thrown from emitError() propagates to the caller.
        fInException = true;
        try
        {
            if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning)
                emitError
                (
                    XMLErrs::XMLException_Warning
                    , excToCatch.getCode()
                    , excToCatch.getMessage()
                );
            else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal)
                emitError
                (
                    XMLErrs::XMLException_Fatal
                    , excToCatch.getCode()
                    , excToCatch.getMessage()
                );
            else
                emitError
                (
                    XMLErrs::XMLException_Error
                    , excToCatch.getCode()
                    , excToCatch.getMessage()
                );
        }
        catch(const OutOfMemoryException&)
        {
            // This is a special case for out-of-memory
            // conditions, because resetting the ReaderMgr
            // can be problematic. release() is called on the reset
            // object before rethrowing.
            resetReaderMgr.release();

            throw;
        }
    }
    catch(const OutOfMemoryException&)
    {
        // This is a special case for out-of-memory
        // conditions, because resetting the ReaderMgr
        // can be problematic. release() is called on the reset
        // object before rethrowing.
        resetReaderMgr.release();

        throw;
    }

    // Still 0 if loading failed or grammarType was not a DTD grammar
    return loadedGrammar;
}

//  Scans 'src' as an external DTD subset into a freshly created DTDGrammar
//  and returns that grammar (fDTDGrammar).
//
//  src     - input source holding the DTD; its system id is used for the
//            pseudo entity and, when caching, for the grammar description.
//  toCache - when true, the grammar is re-registered with the grammar
//            resolver under src's system id before scanning, and
//            cacheGrammars() is called after scanning.
//
//  Throws a RuntimeException if no reader can be created for 'src'.
Grammar* DGXMLScanner::loadDTDGrammar(const InputSource& src,
                                      const bool toCache)
{
    // Reset the validators
    fDTDValidator->reset();
    if (fValidatorFromUser)
        fValidator->reset();

    // Create a new, empty DTD grammar and make it the current grammar of
    // both the scanner and the validator.
    fDTDGrammar = new (fGrammarPoolMemoryManager) DTDGrammar(fGrammarPoolMemoryManager);
    fGrammarResolver->putGrammar(fDTDGrammar);
    fGrammar = fDTDGrammar;
    fValidator->setGrammar(fGrammar);

    //  And for all installed handlers, send reset events. This gives them
    //  a chance to flush any cached data.
    if (fDocHandler)
        fDocHandler->resetDocument();
    if (fEntityHandler)
        fEntityHandler->resetEntities();
    if (fErrorReporter)
        fErrorReporter->resetErrors();

    // Clear out the id reference list
    resetValidationContext();

    if (toCache) {

        // Intern the system id in the resolver's string pool so the
        // pointer stored in the grammar description comes from the pool.
        unsigned int sysId = fGrammarResolver->getStringPool()->addOrFind(src.getSystemId());
        const XMLCh* sysIdStr = fGrammarResolver->getStringPool()->getValueForId(sysId);

        //  NOTE(review): presumably the putGrammar() above registered the
        //  grammar under the default DTD entity key, so it is orphaned from
        //  that key and put back under the real system id - confirm.
        fGrammarResolver->orphanGrammar(XMLUni::fgDTDEntityString);
        ((XMLDTDDescription*) (fGrammar->getGrammarDescription()))->setSystemId(sysIdStr);
        fGrammarResolver->putGrammar(fGrammar);
    }

    //  Handle the creation of the XML reader object for this input source.
    //  This will provide us with transcoding and basic lexing services.
02108 XMLReader* newReader = fReaderMgr.createReader 02109 ( 02110 src 02111 , false 02112 , XMLReader::RefFrom_NonLiteral 02113 , XMLReader::Type_General 02114 , XMLReader::Source_External 02115 , fCalculateSrcOfs 02116 , fLowWaterMark 02117 ); 02118 if (!newReader) { 02119 if (src.getIssueFatalErrorIfNotFound()) 02120 ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Scan_CouldNotOpenSource, src.getSystemId(), fMemoryManager); 02121 else 02122 ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Scan_CouldNotOpenSource_Warning, src.getSystemId(), fMemoryManager); 02123 } 02124 02125 // In order to make the processing work consistently, we have to 02126 // make this look like an external entity. So create an entity 02127 // decl and fill it in and push it with the reader, as happens 02128 // with an external entity. Put a janitor on it to insure it gets 02129 // cleaned up. The reader manager does not adopt them. 02130 const XMLCh gDTDStr[] = { chLatin_D, chLatin_T, chLatin_D , chNull }; 02131 DTDEntityDecl* declDTD = new (fMemoryManager) DTDEntityDecl(gDTDStr, false, fMemoryManager); 02132 declDTD->setSystemId(src.getSystemId()); 02133 declDTD->setIsExternal(true); 02134 Janitor<DTDEntityDecl> janDecl(declDTD); 02135 02136 // Mark this one as a throw at end 02137 newReader->setThrowAtEnd(true); 02138 02139 // And push it onto the stack, with its pseudo name 02140 fReaderMgr.pushReader(newReader, declDTD); 02141 02142 // If we have a doc type handler and advanced callbacks are enabled, 02143 // call the doctype event. 
02144 if (fDocTypeHandler) { 02145 02146 // Create a dummy root 02147 DTDElementDecl* rootDecl = new (fGrammarPoolMemoryManager) DTDElementDecl 02148 ( 02149 gDTDStr 02150 , fEmptyNamespaceId 02151 , DTDElementDecl::Any 02152 , fGrammarPoolMemoryManager 02153 ); 02154 rootDecl->setCreateReason(DTDElementDecl::AsRootElem); 02155 rootDecl->setExternalElemDeclaration(true); 02156 Janitor<DTDElementDecl> janSrc(rootDecl); 02157 02158 fDocTypeHandler->doctypeDecl(*rootDecl, src.getPublicId(), src.getSystemId(), false, true); 02159 } 02160 02161 // Create DTDScanner 02162 DTDScanner dtdScanner 02163 ( 02164 (DTDGrammar*)fGrammar 02165 , fDocTypeHandler 02166 , fGrammarPoolMemoryManager 02167 , fMemoryManager 02168 ); 02169 dtdScanner.setScannerInfo(this, &fReaderMgr, &fBufMgr); 02170 02171 // Tell it its not in an include section 02172 dtdScanner.scanExtSubsetDecl(false, true); 02173 02174 if (fValidate) { 02175 // validate the DTD scan so far 02176 fValidator->preContentValidation(false, true); 02177 } 02178 02179 if (toCache) 02180 fGrammarResolver->cacheGrammars(); 02181 02182 return fDTDGrammar; 02183 } 02184 02185 02186 // --------------------------------------------------------------------------- 02187 // DGXMLScanner: Private helper methods 02188 // --------------------------------------------------------------------------- 02189 // This method handles the common initialization, to avoid having to do 02190 // it redundantly in multiple constructors. 02191 void DGXMLScanner::commonInit() 02192 { 02193 // And we need one for the raw attribute scan. This just stores key/ 02194 // value string pairs (prior to any processing.) 
02195 fAttrNSList = new (fMemoryManager) ValueVectorOf<XMLAttr*>(8, fMemoryManager); 02196 02197 // Create the Validator and init them 02198 fDTDValidator = new (fMemoryManager) DTDValidator(); 02199 initValidator(fDTDValidator); 02200 fDTDElemNonDeclPool = new (fMemoryManager) NameIdPool<DTDElementDecl>(29, 128, fMemoryManager); 02201 fAttDefRegistry = new (fMemoryManager) RefHashTableOf<unsigned int, PtrHasher> 02202 ( 02203 131, false, fMemoryManager 02204 ); 02205 fUndeclaredAttrRegistry = new (fMemoryManager) Hash2KeysSetOf<StringHasher>(7, fMemoryManager); 02206 02207 if (fValidator) 02208 { 02209 if (!fValidator->handlesDTD()) 02210 ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Gen_NoDTDValidator, fMemoryManager); 02211 } 02212 else 02213 { 02214 fValidator = fDTDValidator; 02215 } 02216 } 02217 02218 void DGXMLScanner::cleanUp() 02219 { 02220 delete fAttrNSList; 02221 delete fDTDValidator; 02222 delete fDTDElemNonDeclPool; 02223 delete fAttDefRegistry; 02224 delete fUndeclaredAttrRegistry; 02225 } 02226 02227 02228 // This method is called from scanStartTagNS() to build up the list of 02229 // XMLAttr objects that will be passed out in the start tag callout. We 02230 // get the key/value pairs from the raw scan of explicitly provided attrs, 02231 // which have not been normalized. And we get the element declaration from 02232 // which we will get any defaulted or fixed attribute defs and add those 02233 // in as well. 02234 XMLSize_t 02235 DGXMLScanner::buildAttList(const XMLSize_t attCount 02236 , XMLElementDecl* elemDecl 02237 , RefVectorOf<XMLAttr>& toFill) 02238 { 02239 // Ask the element to clear the 'provided' flag on all of the att defs 02240 // that it owns, and to return us a boolean indicating whether it has 02241 // any defs. 02242 const bool hasDefs = elemDecl->hasAttDefs(); 02243 02244 // If there are no expliclitily provided attributes and there are no 02245 // defined attributes for the element, the we don't have anything to do. 
02246 // So just return zero in this case. 02247 if (!hasDefs && !attCount) 02248 return 0; 02249 02250 // Keep up with how many attrs we end up with total 02251 XMLSize_t retCount = attCount; 02252 02253 // And get the current size of the output vector. This lets us use 02254 // existing elements until we fill it, then start adding new ones. 02255 const XMLSize_t curAttListSize = toFill.size(); 02256 02257 // Ok, so lets get an enumerator for the attributes of this element 02258 // and run through them for well formedness and validity checks. But 02259 // make sure that we had any attributes before we do it, since the list 02260 // would have have gotten faulted in anyway. 02261 if (hasDefs) 02262 { 02263 XMLAttDefList& attDefList = elemDecl->getAttDefList(); 02264 for(XMLSize_t i=0; i<attDefList.getAttDefCount(); i++) 02265 { 02266 // Get the current att def, for convenience and its def type 02267 XMLAttDef& curDef = attDefList.getAttDef(i); 02268 02269 unsigned int *attCountPtr = fAttDefRegistry->get(&curDef); 02270 if (!attCountPtr || *attCountPtr < fElemCount) 02271 { // did not occur 02272 const XMLAttDef::DefAttTypes defType = curDef.getDefaultType(); 02273 02274 if (fValidate) 02275 { 02276 // If we are validating and its required, then an error 02277 if (defType == XMLAttDef::Required) 02278 { 02279 fValidator->emitError 02280 ( 02281 XMLValid::RequiredAttrNotProvided 02282 , curDef.getFullName() 02283 ); 02284 } 02285 else if ((defType == XMLAttDef::Default) || 02286 (defType == XMLAttDef::Fixed) ) 02287 { 02288 if (fStandalone && curDef.isExternal()) 02289 { 02290 // XML 1.0 Section 2.9 02291 // Document is standalone, so attributes must not be defaulted. 
02292 fValidator->emitError(XMLValid::NoDefAttForStandalone, curDef.getFullName(), elemDecl->getFullName()); 02293 } 02294 } 02295 } 02296 02297 // Fault in the value if needed, and bump the att count 02298 if ((defType == XMLAttDef::Default) 02299 || (defType == XMLAttDef::Fixed)) 02300 { 02301 // Let the validator pass judgement on the attribute value 02302 if (fValidate) 02303 { 02304 fValidator->validateAttrValue 02305 ( 02306 &curDef 02307 , curDef.getValue() 02308 , false 02309 , elemDecl 02310 ); 02311 } 02312 02313 XMLAttr* curAtt; 02314 if (retCount >= curAttListSize) 02315 { 02316 if (fDoNamespaces) 02317 { 02318 curAtt = new (fMemoryManager) XMLAttr 02319 ( 02320 fEmptyNamespaceId 02321 , curDef.getFullName() 02322 , curDef.getValue() 02323 , curDef.getType() 02324 , false 02325 , fMemoryManager 02326 ); 02327 } 02328 else 02329 { 02330 curAtt = new (fMemoryManager) XMLAttr 02331 ( 02332 0 02333 , curDef.getFullName() 02334 , XMLUni::fgZeroLenString 02335 , curDef.getValue() 02336 , curDef.getType() 02337 , false 02338 , fMemoryManager 02339 ); 02340 } 02341 02342 fAttrList->addElement(curAtt); 02343 } 02344 else 02345 { 02346 curAtt = fAttrList->elementAt(retCount); 02347 if (fDoNamespaces) 02348 { 02349 curAtt->set 02350 ( 02351 fEmptyNamespaceId 02352 , curDef.getFullName() 02353 , curDef.getValue() 02354 , curDef.getType() 02355 ); 02356 } 02357 else 02358 { 02359 curAtt->set 02360 ( 02361 0 02362 , curDef.getFullName() 02363 , XMLUni::fgZeroLenString 02364 , curDef.getValue() 02365 , curDef.getType() 02366 ); 02367 } 02368 curAtt->setSpecified(false); 02369 } 02370 02371 if (fDoNamespaces) 02372 { 02373 // Map the new attribute's prefix to a URI id and store 02374 // that in the attribute object. 
02375 const XMLCh* attPrefix = curAtt->getPrefix(); 02376 if (attPrefix && *attPrefix) { 02377 curAtt->setURIId 02378 ( 02379 resolvePrefix(attPrefix, ElemStack::Mode_Attribute) 02380 ); 02381 } 02382 } 02383 02384 retCount++; 02385 } 02386 } 02387 } 02388 } 02389 02390 return retCount; 02391 } 02392 02393 02394 // This method will reset the scanner data structures, and related plugged 02395 // in stuff, for a new scan session. We get the input source for the primary 02396 // XML entity, create the reader for it, and push it on the stack so that 02397 // upon successful return from here we are ready to go. 02398 void DGXMLScanner::scanReset(const InputSource& src) 02399 { 02400 02401 // This call implicitly tells us that we are going to reuse the scanner 02402 // if it was previously used. So tell the validator to reset itself. 02403 // 02404 // But, if the fUseCacheGrammar flag is set, then don't reset it. 02405 // 02406 // NOTE: The ReaderMgr is flushed on the way out, because that is 02407 // required to insure that files are closed. 02408 fGrammarResolver->cacheGrammarFromParse(fToCacheGrammar); 02409 fGrammarResolver->useCachedGrammarInParse(fUseCachedGrammar); 02410 02411 fDTDGrammar = new (fGrammarPoolMemoryManager) DTDGrammar(fGrammarPoolMemoryManager); 02412 fGrammarResolver->putGrammar(fDTDGrammar); 02413 fGrammar = fDTDGrammar; 02414 fRootGrammar = 0; 02415 fValidator->setGrammar(fGrammar); 02416 02417 // Reset validation 02418 fValidate = (fValScheme == Val_Always) ? true : false; 02419 02420 // And for all installed handlers, send reset events. This gives them 02421 // a chance to flush any cached data. 
02422 if (fDocHandler) 02423 fDocHandler->resetDocument(); 02424 if (fEntityHandler) 02425 fEntityHandler->resetEntities(); 02426 if (fErrorReporter) 02427 fErrorReporter->resetErrors(); 02428 02429 // Clear out the id reference list 02430 resetValidationContext(); 02431 02432 // Reset the Root Element Name 02433 fMemoryManager->deallocate(fRootElemName);//delete [] fRootElemName; 02434 fRootElemName = 0; 02435 02436 // Reset the element stack, and give it the latest ids for the special 02437 // URIs it has to know about. 02438 fElemStack.reset 02439 ( 02440 fEmptyNamespaceId 02441 , fUnknownNamespaceId 02442 , fXMLNamespaceId 02443 , fXMLNSNamespaceId 02444 ); 02445 02446 // Reset some status flags 02447 fInException = false; 02448 fStandalone = false; 02449 fErrorCount = 0; 02450 fHasNoDTD = true; 02451 02452 // Reset the validators 02453 fDTDValidator->reset(); 02454 fDTDValidator->setErrorReporter(fErrorReporter); 02455 if (fValidatorFromUser) 02456 fValidator->reset(); 02457 02458 // Handle the creation of the XML reader object for this input source. 02459 // This will provide us with transcoding and basic lexing services. 
02460 XMLReader* newReader = fReaderMgr.createReader 02461 ( 02462 src 02463 , true 02464 , XMLReader::RefFrom_NonLiteral 02465 , XMLReader::Type_General 02466 , XMLReader::Source_External 02467 , fCalculateSrcOfs 02468 , fLowWaterMark 02469 ); 02470 02471 if (!newReader) { 02472 if (src.getIssueFatalErrorIfNotFound()) 02473 ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Scan_CouldNotOpenSource, src.getSystemId(), fMemoryManager); 02474 else 02475 ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Scan_CouldNotOpenSource_Warning, src.getSystemId(), fMemoryManager); 02476 } 02477 02478 // Push this read onto the reader manager 02479 fReaderMgr.pushReader(newReader, 0); 02480 02481 // and reset security-related things if necessary: 02482 if(fSecurityManager != 0) 02483 { 02484 fEntityExpansionLimit = fSecurityManager->getEntityExpansionLimit(); 02485 fEntityExpansionCount = 0; 02486 } 02487 if(fUIntPoolRowTotal >= 32) 02488 { // 8 KB tied up with validating attributes... 02489 fAttDefRegistry->removeAll(); 02490 recreateUIntPool(); 02491 } 02492 else 02493 { 02494 // note that this will implicitly reset the values of the hashtables, 02495 // though their buckets will still be tied up 02496 resetUIntPool(); 02497 } 02498 fUndeclaredAttrRegistry->removeAll(); 02499 fAttrNSList->removeAllElements(); 02500 } 02501 02502 02503 // This method is called between markup in content. It scans for character 02504 // data that is sent to the document handler. It watches for any markup 02505 // characters that would indicate that the character data has ended. It also 02506 // handles expansion of general and character entities. 02507 // 02508 // sendData() is a local static helper for this method which handles some 02509 // code that must be done in three different places here. 
02510 void DGXMLScanner::sendCharData(XMLBuffer& toSend) 02511 { 02512 // If no data in the buffer, then nothing to do 02513 if (toSend.isEmpty()) 02514 return; 02515 02516 // We do different things according to whether we are validating or 02517 // not. If not, its always just characters; else, it depends on the 02518 // current element's content model. 02519 if (fValidate) 02520 { 02521 // Get the raw data we need for the callback 02522 const XMLCh* const rawBuf = toSend.getRawBuffer(); 02523 const XMLSize_t len = toSend.getLen(); 02524 02525 // And see if the current element is a 'Children' style content model 02526 const ElemStack::StackElem* topElem = fElemStack.topElement(); 02527 02528 // Get the character data opts for the current element 02529 XMLElementDecl::CharDataOpts charOpts = topElem->fThisElement->getCharDataOpts(); 02530 02531 if (charOpts == XMLElementDecl::NoCharData) 02532 { 02533 // They definitely cannot handle any type of char data 02534 fValidator->emitError(XMLValid::NoCharDataInCM); 02535 } 02536 else if (fReaderMgr.getCurrentReader()->isAllSpaces(rawBuf, len)) 02537 { 02538 // Its all spaces. So, if they can take spaces, then send it 02539 // as ignorable whitespace. If they can handle any char data 02540 // send it as characters. 02541 if (charOpts == XMLElementDecl::SpacesOk) { 02542 if (fDocHandler) 02543 fDocHandler->ignorableWhitespace(rawBuf, len, false); 02544 } 02545 else if (charOpts == XMLElementDecl::AllCharData) 02546 { 02547 if (fDocHandler) 02548 fDocHandler->docCharacters(rawBuf, len, false); 02549 } 02550 } 02551 else 02552 { 02553 // If they can take any char data, then send it. Otherwise, they 02554 // can only handle whitespace and can't handle this stuff so 02555 // issue an error. 
02556 if (charOpts == XMLElementDecl::AllCharData) 02557 { 02558 if (fDocHandler) 02559 fDocHandler->docCharacters(rawBuf, len, false); 02560 } 02561 else 02562 { 02563 fValidator->emitError(XMLValid::NoCharDataInCM); 02564 } 02565 } 02566 } 02567 else 02568 { 02569 // Always assume its just char data if not validating 02570 if (fDocHandler) 02571 fDocHandler->docCharacters(toSend.getRawBuffer(), toSend.getLen(), false); 02572 } 02573 02574 // Reset buffer 02575 toSend.reset(); 02576 } 02577 02578 02579 02580 // This method is called with a key/value string pair that represents an 02581 // xmlns="yyy" or xmlns:xxx="yyy" attribute. This method will update the 02582 // current top of the element stack based on this data. We know that when 02583 // we get here, that it is one of these forms, so we don't bother confirming 02584 // it. 02585 // 02586 // But we have to ensure 02587 // 1. xxx is not xmlns 02588 // 2. if xxx is xml, then yyy must match XMLUni::fgXMLURIName, and vice versa 02589 // 3. yyy is not XMLUni::fgXMLNSURIName 02590 // 4. if xxx is not null, then yyy cannot be an empty string. 02591 void DGXMLScanner::updateNSMap(const XMLCh* const attrPrefix 02592 , const XMLCh* const attrLocalName 02593 , const XMLCh* const attrValue) 02594 { 02595 // We either have the default prefix (""), or we point it into the attr 02596 // name parameter. Note that the xmlns is not the prefix we care about 02597 // here. To us, the 'prefix' is really the local part of the attrName 02598 // parameter. 02599 // 02600 // Check 1. xxx is not xmlns 02601 // 2. if xxx is xml, then yyy must match XMLUni::fgXMLURIName, and vice versa 02602 // 3. yyy is not XMLUni::fgXMLNSURIName 02603 // 4. if xxx is not null, then yyy cannot be an empty string. 
02604 if (attrPrefix && *attrPrefix) { 02605 02606 if (XMLString::equals(attrLocalName, XMLUni::fgXMLNSString)) 02607 emitError(XMLErrs::NoUseOfxmlnsAsPrefix); 02608 else if (XMLString::equals(attrLocalName, XMLUni::fgXMLString)) { 02609 if (!XMLString::equals(attrValue, XMLUni::fgXMLURIName)) 02610 emitError(XMLErrs::PrefixXMLNotMatchXMLURI); 02611 } 02612 02613 if (!attrValue) 02614 emitError(XMLErrs::NoEmptyStrNamespace, attrLocalName); 02615 else if(!*attrValue && fXMLVersion == XMLReader::XMLV1_0) 02616 emitError(XMLErrs::NoEmptyStrNamespace, attrLocalName); 02617 } 02618 02619 if (XMLString::equals(attrValue, XMLUni::fgXMLNSURIName)) 02620 emitError(XMLErrs::NoUseOfxmlnsURI); 02621 else if (XMLString::equals(attrValue, XMLUni::fgXMLURIName)) { 02622 if (!XMLString::equals(attrLocalName, XMLUni::fgXMLString)) 02623 emitError(XMLErrs::XMLURINotMatchXMLPrefix); 02624 } 02625 02626 // Ok, we have to get the unique id for the attribute value, which is the 02627 // URI that this value should be mapped to. The validator has the 02628 // namespace string pool, so we ask him to find or add this new one. Then 02629 // we ask the element stack to add this prefix to URI Id mapping. 
02630 fElemStack.addPrefix 02631 ( 02632 attrLocalName 02633 , fURIStringPool->addOrFind(attrValue) 02634 ); 02635 } 02636 02637 void DGXMLScanner::scanAttrListforNameSpaces(RefVectorOf<XMLAttr>* theAttrList, XMLSize_t attCount, 02638 XMLElementDecl* elemDecl) 02639 { 02640 // Map prefixes to uris 02641 for (XMLSize_t i=0; i < fAttrNSList->size(); i++) { 02642 XMLAttr* providedAttr = fAttrNSList->elementAt(i); 02643 providedAttr->setURIId( 02644 resolvePrefix(providedAttr->getPrefix(), ElemStack::Mode_Attribute) 02645 ); 02646 } 02647 02648 fAttrNSList->removeAllElements(); 02649 02650 // Decide if to use hash table to do duplicate checking 02651 bool toUseHashTable = false; 02652 02653 setAttrDupChkRegistry(attCount, toUseHashTable); 02654 for (XMLSize_t index = 0; index < attCount; index++) 02655 { 02656 // check for duplicate namespace attributes: 02657 // by checking for qualified names with the same local part and with prefixes 02658 // which have been bound to namespace names that are identical. 
02659 XMLAttr* curAttr = theAttrList->elementAt(index); 02660 if (!toUseHashTable) 02661 { 02662 XMLAttr* loopAttr; 02663 for (XMLSize_t attrIndex=0; attrIndex < index; attrIndex++) { 02664 loopAttr = theAttrList->elementAt(attrIndex); 02665 if (loopAttr->getURIId() == curAttr->getURIId() && 02666 XMLString::equals(loopAttr->getName(), curAttr->getName())) { 02667 emitError( 02668 XMLErrs::AttrAlreadyUsedInSTag, curAttr->getName() 02669 , elemDecl->getFullName() 02670 ); 02671 } 02672 } 02673 } 02674 else 02675 { 02676 if (fAttrDupChkRegistry->containsKey((void*)curAttr->getName(), curAttr->getURIId())) 02677 { 02678 emitError( 02679 XMLErrs::AttrAlreadyUsedInSTag 02680 , curAttr->getName(), elemDecl->getFullName() 02681 ); 02682 } 02683 02684 fAttrDupChkRegistry->put((void*)curAttr->getName(), curAttr->getURIId(), curAttr); 02685 } 02686 } 02687 } 02688 02689 InputSource* DGXMLScanner::resolveSystemId(const XMLCh* const sysId 02690 ,const XMLCh* const pubId) 02691 { 02692 //Normalize sysId 02693 XMLBufBid nnSys(&fBufMgr); 02694 XMLBuffer& normalizedSysId = nnSys.getBuffer(); 02695 XMLString::removeChar(sysId, 0xFFFF, normalizedSysId); 02696 const XMLCh* normalizedURI = normalizedSysId.getRawBuffer(); 02697 02698 // Create a buffer for expanding the normalized system id 02699 XMLBufBid bbSys(&fBufMgr); 02700 XMLBuffer& expSysId = bbSys.getBuffer(); 02701 02702 // Allow the entity handler to expand the system id if they choose 02703 // to do so. 
02704 InputSource* srcToFill = 0; 02705 if (fEntityHandler) 02706 { 02707 if (!fEntityHandler->expandSystemId(normalizedURI, expSysId)) 02708 expSysId.set(normalizedURI); 02709 02710 ReaderMgr::LastExtEntityInfo lastInfo; 02711 fReaderMgr.getLastExtEntityInfo(lastInfo); 02712 XMLResourceIdentifier resourceIdentifier(XMLResourceIdentifier::ExternalEntity, 02713 expSysId.getRawBuffer(), 0, pubId, lastInfo.systemId, 02714 &fReaderMgr); 02715 srcToFill = fEntityHandler->resolveEntity(&resourceIdentifier); 02716 } 02717 else 02718 { 02719 expSysId.set(normalizedURI); 02720 } 02721 02722 // If they didn't create a source via the entity handler, then we 02723 // have to create one on our own. 02724 if (!srcToFill) 02725 { 02726 if (fDisableDefaultEntityResolution) 02727 return srcToFill; 02728 02729 ReaderMgr::LastExtEntityInfo lastInfo; 02730 fReaderMgr.getLastExtEntityInfo(lastInfo); 02731 02732 XMLURL urlTmp(fMemoryManager); 02733 if ((!urlTmp.setURL(lastInfo.systemId, expSysId.getRawBuffer(), urlTmp)) || 02734 (urlTmp.isRelative())) 02735 { 02736 if (!fStandardUriConformant) 02737 { 02738 XMLBufBid ddSys(&fBufMgr); 02739 XMLBuffer& resolvedSysId = ddSys.getBuffer(); 02740 XMLUri::normalizeURI(expSysId.getRawBuffer(), resolvedSysId); 02741 02742 srcToFill = new (fMemoryManager) LocalFileInputSource 02743 ( 02744 lastInfo.systemId 02745 , resolvedSysId.getRawBuffer() 02746 , fMemoryManager 02747 ); 02748 } 02749 else 02750 ThrowXMLwithMemMgr(MalformedURLException, XMLExcepts::URL_MalformedURL, fMemoryManager); 02751 } 02752 else 02753 { 02754 if (fStandardUriConformant && urlTmp.hasInvalidChar()) 02755 ThrowXMLwithMemMgr(MalformedURLException, XMLExcepts::URL_MalformedURL, fMemoryManager); 02756 srcToFill = new (fMemoryManager) URLInputSource(urlTmp, fMemoryManager); 02757 } 02758 } 02759 02760 return srcToFill; 02761 } 02762 02763 // --------------------------------------------------------------------------- 02764 // DGXMLScanner: Private parsing methods 02765 // 
// ---------------------------------------------------------------------------
//  Scans a quoted attribute value into 'toFill', normalizing whitespace
//  according to the attribute's type.
//
//  attDef    Declaration of the attribute, or null if it is undeclared; an
//            undeclared attribute is treated as CDATA and as not external.
//  attrName  Attribute name, used only in error messages.
//  toFill    Receives the normalized value; it is reset first.
//
//  Returns true once the closing quote is found in the same reader that
//  held the opening quote. Returns false if the next character is not a
//  quote, or if the closing quote shows up in a reader with a lower reader
//  number (reported as PartialMarkupInEntity).
//
//  Throws an UnexpectedEOFException if the input ends inside the value.
bool DGXMLScanner::scanAttValue(  const   XMLAttDef* const    attDef
                                  , const XMLCh *const        attrName
                                  ,       XMLBuffer&          toFill)
{
    //  States of the whitespace-collapsing logic used for non-CDATA types
    enum States
    {
        InWhitespace
        , InContent
    };

    // Get the attribute type; without a declaration it is CDATA
    const XMLAttDef::AttTypes type = (attDef)
                ?attDef->getType()
                :XMLAttDef::CData;

    // Reset the target buffer
    toFill.reset();

    // Get the next char which must be a single or double quote
    XMLCh quoteCh;
    if (!fReaderMgr.skipIfQuote(quoteCh))
        return false;

    //  We have to get the current reader because we have to ignore closing
    //  quotes until we hit the same reader again.
    const XMLSize_t curReader = fReaderMgr.getCurrentReaderNum();

    // Get attribute def - to check to see if it's declared externally or not
    bool isAttExternal = (attDef)
                ?attDef->isExternal()
                :false;

    //  Loop until we get the attribute value. Note that we use a double
    //  loop here to avoid the setup/teardown overhead of the exception
    //  handler on every round.
    XMLCh   nextCh;
    XMLCh   secondCh = 0;
    States  curState = InContent;
    bool    firstNonWS = false;
    bool    gotLeadingSurrogate = false;
    bool    escaped;
    while (true)
    {
    try
    {
        while(true)
        {
            nextCh = fReaderMgr.getNextChar();

            if (!nextCh)
                ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);

            // Check for our ending quote in the same entity
            if (nextCh == quoteCh)
            {
                if (curReader == fReaderMgr.getCurrentReaderNum())
                    return true;

                // Watch for spillover into a previous entity
                if (curReader > fReaderMgr.getCurrentReaderNum())
                {
                    emitError(XMLErrs::PartialMarkupInEntity);
                    return false;
                }

                //  Otherwise the quote came from a reader pushed after we
                //  started, so it is ordinary content and falls through.
            }

            //  Check for an entity ref now, before we let it affect our
            //  whitespace normalization logic below. We ignore the empty flag
            //  in this one.
            escaped = false;
            if (nextCh == chAmpersand)
            {
                //  Anything other than EntityExp_Returned means there is no
                //  character to process on this round.
                if (scanEntityRef(true, nextCh, secondCh, escaped) != EntityExp_Returned)
                {
                    gotLeadingSurrogate = false;
                    continue;
                }
            }
            else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
            {
                // Deal with surrogate pairs
                //  It's a leading surrogate. If we already got one, then
                //  issue an error, else set leading flag to make sure that
                //  we look for a trailing next time.
                if (gotLeadingSurrogate)
                    emitError(XMLErrs::Expected2ndSurrogateChar);
                else
                    gotLeadingSurrogate = true;
            }
            else
            {
                //  If it's a trailing surrogate, make sure that we are
                //  prepared for that. Else, it's just a regular char so make
                //  sure that we were not expecting a trailing surrogate.
                if ((nextCh >= 0xDC00) && (nextCh <= 0xDFFF))
                {
                    // It's trailing, so make sure we were expecting it
                    if (!gotLeadingSurrogate)
                        emitError(XMLErrs::Unexpected2ndSurrogateChar);
                }
                else
                {
                    //  It's just a char, so make sure we were not expecting a
                    //  trailing surrogate.
                    if (gotLeadingSurrogate)
                        emitError(XMLErrs::Expected2ndSurrogateChar);

                    // It's got to at least be a valid XML character
                    if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh))
                    {
                        // Render the offending code unit in base 16 for the message
                        XMLCh tmpBuf[9];
                        XMLString::binToText
                        (
                            nextCh
                            , tmpBuf
                            , 8
                            , 16
                            , fMemoryManager
                        );
                        emitError(XMLErrs::InvalidCharacterInAttrValue, attrName, tmpBuf);
                    }
                }
                gotLeadingSurrogate = false;
            }

            //  If it's not escaped, then make sure it's not a < character, which
            //  is not allowed in attribute values.
            if (!escaped && (nextCh == chOpenAngle))
                emitError(XMLErrs::BracketInAttrValue, attrName);

            //  If the attribute is a CDATA type we do simple replacement of
            //  tabs and new lines with spaces, if the character is not escaped
            //  by way of a char ref.
            //
            //  Otherwise, we do the standard non-CDATA normalization of
            //  compressing whitespace to single spaces and getting rid of leading
            //  and trailing whitespace.
            if (type == XMLAttDef::CData)
            {
                if (!escaped)
                {
                    if ((nextCh == 0x09) || (nextCh == 0x0A) || (nextCh == 0x0D))
                    {
                        // Check Validity Constraint for Standalone document declaration
                        // XML 1.0, Section 2.9
                        if (fStandalone && fValidate && isAttExternal)
                        {
                             // Can't have a standalone document declaration of "yes" if attribute
                             // values are subject to normalisation
                             fValidator->emitError(XMLValid::NoAttNormForStandalone, attrName);
                        }
                        nextCh = chSpace;
                    }
                }
            }
            else
            {
                if (curState == InWhitespace)
                {
                    //  An escaped non-space character, or any non-whitespace
                    //  character, ends the whitespace run. The pending run is
                    //  emitted as one space unless it was leading whitespace.
                    if ((escaped && nextCh != chSpace) || !fReaderMgr.getCurrentReader()->isWhitespace(nextCh))
                    {
                        if (firstNonWS)
                            toFill.append(chSpace);
                        curState = InContent;
                        firstNonWS = true;
                    }
                    else
                    {
                        continue;
                    }
                }
                else if (curState == InContent)
                {
                    //  A space (escaped or not) or an unescaped whitespace
                    //  character starts a whitespace run; nothing is appended
                    //  until we know whether content follows.
                    if ((nextCh == chSpace) ||
                        (fReaderMgr.getCurrentReader()->isWhitespace(nextCh) && !escaped))
                    {
                        curState = InWhitespace;

                        // Check Validity Constraint for Standalone document declaration
                        // XML 1.0, Section 2.9
                        if (fStandalone && fValidate && isAttExternal)
                        {
                            //  Normalization changes the value if this is leading
                            //  whitespace, is not a plain space, or is followed
                            //  by another space.
                            if (!firstNonWS || (nextCh != chSpace) || (fReaderMgr.lookingAtSpace()))
                            {
                                 // Can't have a standalone document declaration of "yes" if attribute
                                 // values are subject to normalisation
                                 fValidator->emitError(XMLValid::NoAttNormForStandalone, attrName);
                            }
                        }
                        continue;
                    }
                    firstNonWS = true;
                }
            }

            // Else add it to the buffer
            toFill.append(nextCh);

            //  scanEntityRef() can hand back a second character; append it
            //  right after the first and clear it for the next round.
            if (secondCh)
            {
                toFill.append(secondCh);
                secondCh=0;
            }
        }
    }
    catch(const EndOfEntityException&)
    {
        // Just eat it and continue.
        gotLeadingSurrogate = false;
        escaped = false;
    }
    }
    return true;
}


//  This method scans a CDATA section. It collects the characters into one
//  of the temp buffers and calls the document handler, if any, with the
//  characters. It assumes that the <![CDATA string has been scanned before
//  this call.
//
//  Throws an UnexpectedEOFException (after emitting
//  UnterminatedCDATASection) if the input ends before the closing sequence.
void DGXMLScanner::scanCDSection()
{
    //  Only "]>" is listed: the first ']' of "]]>" has already been read as
    //  the current character when this is matched below.
    static const XMLCh CDataClose[] =
    {
            chCloseSquare, chCloseAngle, chNull
    };

    //  The next character should be the opening square bracket. If not
    //  issue an error, but then try to recover by skipping any whitespace
    //  and checking again.
    if (!fReaderMgr.skippedChar(chOpenSquare))
    {
        emitError(XMLErrs::ExpectedOpenSquareBracket);
        fReaderMgr.skipPastSpaces();

        // If we still don't find it, then give up, else keep going
        if (!fReaderMgr.skippedChar(chOpenSquare))
            return;
    }

    // Get a buffer for this
    XMLBufBid bbCData(&fBufMgr);

    //  We just scan forward until we hit the end of CDATA section sequence.
    //  CDATA is effectively a big escape mechanism so we don't treat markup
    //  characters specially here.
    bool            emittedError = false;
    bool    gotLeadingSurrogate = false;

    // Get the character data opts for the current element
    const ElemStack::StackElem* topElem = fElemStack.topElement();
    XMLElementDecl::CharDataOpts charOpts =  topElem->fThisElement->getCharDataOpts();

    while (true)
    {
        const XMLCh nextCh = fReaderMgr.getNextChar();

        // Watch for unexpected end of file
        if (!nextCh)
        {
            emitError(XMLErrs::UnterminatedCDATASection);
            ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
        }

        if (fValidate && fStandalone && (fReaderMgr.getCurrentReader()->isWhitespace(nextCh)))
        {
            // This document is standalone; this ignorable CDATA whitespace is forbidden.
            // XML 1.0, Section 2.9
            // And see if the current element is a 'Children' style content model
            if (topElem->fThisElement->isExternal()) {

                if (charOpts == XMLElementDecl::SpacesOk) // Element Content
                {
                    // Error - standalone should have a value of "no" as whitespace detected in an
                    // element type with element content whose element declaration was external
                    fValidator->emitError(XMLValid::NoWSForStandalone);
                }
            }
        }

        //  If this is a close square bracket it could be our closing
        //  sequence.
        if (nextCh == chCloseSquare && fReaderMgr.skippedString(CDataClose))
        {
            //  make sure we were not expecting a trailing surrogate.
            if (gotLeadingSurrogate)
                emitError(XMLErrs::Expected2ndSurrogateChar);

            if (fValidate) {

                //  Only a content model that accepts all char data may
                //  contain a CDATA section.
                if (charOpts != XMLElementDecl::AllCharData)
                {
                    // They definitely cannot handle any type of char data
                    fValidator->emitError(XMLValid::NoCharDataInCM);
                }
            }

            // If we have a doc handler, call it
            if (fDocHandler)
            {
                fDocHandler->docCharacters
                (
                    bbCData.getRawBuffer()
                    , bbCData.getLen()
                    , true
                );
            }

            // And we are done
            break;
        }

        //  Make sure it's a valid character. But if we've emitted an error
        //  already, don't bother with the overhead since we've already told
        //  them about it.
        if (!emittedError)
        {
            // Deal with surrogate pairs
            if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
            {
                //  It's a leading surrogate. If we already got one, then
                //  issue an error, else set leading flag to make sure that
                //  we look for a trailing next time.
                if (gotLeadingSurrogate)
                    emitError(XMLErrs::Expected2ndSurrogateChar);
                else
                    gotLeadingSurrogate = true;
            }
            else
            {
                //  If it's a trailing surrogate, make sure that we are
                //  prepared for that. Else, it's just a regular char so make
                //  sure that we were not expecting a trailing surrogate.
                if ((nextCh >= 0xDC00) && (nextCh <= 0xDFFF))
                {
                    // It's trailing, so make sure we were expecting it
                    if (!gotLeadingSurrogate)
                        emitError(XMLErrs::Unexpected2ndSurrogateChar);
                }
                else
                {
                    //  It's just a char, so make sure we were not expecting a
                    //  trailing surrogate.
                    if (gotLeadingSurrogate)
                        emitError(XMLErrs::Expected2ndSurrogateChar);

                    // It's got to at least be a valid XML character
                    else if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh))
                    {
                        // Render the offending code unit in base 16 for the message
                        XMLCh tmpBuf[9];
                        XMLString::binToText
                        (
                            nextCh
                            , tmpBuf
                            , 8
                            , 16
                            , fMemoryManager
                        );
                        emitError(XMLErrs::InvalidCharacter, tmpBuf);
                        emittedError = true;
                    }
                }
                gotLeadingSurrogate = false;
            }
        }

        // Add it to the buffer
        bbCData.append(nextCh);
    }
}


void DGXMLScanner::scanCharData(XMLBuffer& toUse)
{
    //  We have to watch for the stupid ]]> sequence, which is illegal in
    //  character data. So this is a little state machine that handles that.
    enum States
    {
        State_Waiting
        , State_GotOne
        , State_GotTwo
    };

    // Reset the buffer before we start
    toUse.reset();

    // Turn on the 'throw at end' flag of the reader manager
    ThrowEOEJanitor jan(&fReaderMgr, true);

    //  In order to be more efficient we have to use kind of a deeply nested
    //  set of blocks here. The outer block puts on a try and catches end of
    //  entity exceptions. The inner loop is the per-character loop. If we
    //  put the try inside the inner loop, it would work but would require
    //  the exception handling code setup/teardown code to be invoked for
    //  each character.
    XMLCh   nextCh;
    XMLCh   secondCh = 0;
    States  curState = State_Waiting;
    bool    escaped = false;
    bool    gotLeadingSurrogate = false;
    bool    notDone = true;
    while (notDone)
    {
        try
        {
            while (true)
            {
                //  Eat through as many plain content characters as possible without
                //  needing special handling.
Moving most content characters here, 03175 // in this one call, rather than running the overall loop once 03176 // per content character, is a speed optimization. 03177 if (curState == State_Waiting && !gotLeadingSurrogate) 03178 { 03179 fReaderMgr.movePlainContentChars(toUse); 03180 } 03181 03182 // Try to get another char from the source 03183 // The code from here on down covers all contengencies, 03184 if (!fReaderMgr.getNextCharIfNot(chOpenAngle, nextCh)) 03185 { 03186 // If we were waiting for a trailing surrogate, its an error 03187 if (gotLeadingSurrogate) 03188 emitError(XMLErrs::Expected2ndSurrogateChar); 03189 03190 notDone = false; 03191 break; 03192 } 03193 03194 // Watch for a reference. Note that the escapement mechanism 03195 // is ignored in this content. 03196 escaped = false; 03197 if (nextCh == chAmpersand) 03198 { 03199 sendCharData(toUse); 03200 03201 // Turn off the throwing at the end of entity during this 03202 ThrowEOEJanitor jan(&fReaderMgr, false); 03203 03204 if (scanEntityRef(false, nextCh, secondCh, escaped) != EntityExp_Returned) 03205 { 03206 gotLeadingSurrogate = false; 03207 continue; 03208 } 03209 else 03210 { 03211 if (escaped && !fElemStack.isEmpty()) 03212 fElemStack.setReferenceEscaped(); 03213 } 03214 } 03215 else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF)) 03216 { 03217 // Deal with surrogate pairs 03218 // Its a leading surrogate. If we already got one, then 03219 // issue an error, else set leading flag to make sure that 03220 // we look for a trailing next time. 03221 if (gotLeadingSurrogate) 03222 emitError(XMLErrs::Expected2ndSurrogateChar); 03223 else 03224 gotLeadingSurrogate = true; 03225 } 03226 else 03227 { 03228 // If its a trailing surrogate, make sure that we are 03229 // prepared for that. Else, its just a regular char so make 03230 // sure that we were not expected a trailing surrogate. 
03231 if ((nextCh >= 0xDC00) && (nextCh <= 0xDFFF)) 03232 { 03233 // Its trailing, so make sure we were expecting it 03234 if (!gotLeadingSurrogate) 03235 emitError(XMLErrs::Unexpected2ndSurrogateChar); 03236 } 03237 else 03238 { 03239 // Its just a char, so make sure we were not expecting a 03240 // trailing surrogate. 03241 if (gotLeadingSurrogate) 03242 emitError(XMLErrs::Expected2ndSurrogateChar); 03243 03244 // Make sure the returned char is a valid XML char 03245 if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh)) 03246 { 03247 XMLCh tmpBuf[9]; 03248 XMLString::binToText 03249 ( 03250 nextCh 03251 , tmpBuf 03252 , 8 03253 , 16 03254 , fMemoryManager 03255 ); 03256 emitError(XMLErrs::InvalidCharacter, tmpBuf); 03257 } 03258 } 03259 gotLeadingSurrogate = false; 03260 } 03261 03262 // Keep the state machine up to date 03263 if (!escaped) 03264 { 03265 if (nextCh == chCloseSquare) 03266 { 03267 if (curState == State_Waiting) 03268 curState = State_GotOne; 03269 else if (curState == State_GotOne) 03270 curState = State_GotTwo; 03271 } 03272 else if (nextCh == chCloseAngle) 03273 { 03274 if (curState == State_GotTwo) 03275 emitError(XMLErrs::BadSequenceInCharData); 03276 curState = State_Waiting; 03277 } 03278 else 03279 { 03280 curState = State_Waiting; 03281 } 03282 } 03283 else 03284 { 03285 curState = State_Waiting; 03286 } 03287 03288 // Add this char to the buffer 03289 toUse.append(nextCh); 03290 03291 if (secondCh) 03292 { 03293 toUse.append(secondCh); 03294 secondCh=0; 03295 } 03296 } 03297 } 03298 catch(const EndOfEntityException& toCatch) 03299 { 03300 // Some entity ended, so we have to send any accumulated 03301 // chars and send an end of entity event. 
03302 sendCharData(toUse); 03303 gotLeadingSurrogate = false; 03304 03305 if (fDocHandler) 03306 fDocHandler->endEntityReference(toCatch.getEntity()); 03307 } 03308 } 03309 03310 // Check the validity constraints as per XML 1.0 Section 2.9 03311 if (fValidate && fStandalone) 03312 { 03313 // See if the text contains whitespace 03314 // Get the raw data we need for the callback 03315 const XMLCh* rawBuf = toUse.getRawBuffer(); 03316 const XMLSize_t len = toUse.getLen(); 03317 const bool isSpaces = fReaderMgr.getCurrentReader()->containsWhiteSpace(rawBuf, len); 03318 03319 if (isSpaces) 03320 { 03321 // And see if the current element is a 'Children' style content model 03322 const ElemStack::StackElem* topElem = fElemStack.topElement(); 03323 03324 if (topElem->fThisElement->isExternal()) { 03325 03326 // Get the character data opts for the current element 03327 XMLElementDecl::CharDataOpts charOpts = topElem->fThisElement->getCharDataOpts(); 03328 03329 if (charOpts == XMLElementDecl::SpacesOk) // => Element Content 03330 { 03331 // Error - standalone should have a value of "no" as whitespace detected in an 03332 // element type with element content whose element declaration was external 03333 // 03334 fValidator->emitError(XMLValid::NoWSForStandalone); 03335 } 03336 } 03337 } 03338 } 03339 // Send any char data that we accumulated into the buffer 03340 sendCharData(toUse); 03341 } 03342 03343 03344 // This method will scan a general/character entity ref. It will either 03345 // expand a char ref and return it directly, or push a reader for a general 03346 // entity. 03347 // 03348 // The return value indicates whether the char parameters hold the value 03349 // or whether the value was pushed as a reader, or that it failed. 03350 // 03351 // The escaped flag tells the caller whether the returned parameter resulted 03352 // from a character reference, which escapes the character in some cases. 
It 03353 // only makes any difference if the return value indicates the value was 03354 // returned directly. 03355 DGXMLScanner::EntityExpRes 03356 DGXMLScanner::scanEntityRef( const bool inAttVal 03357 , XMLCh& firstCh 03358 , XMLCh& secondCh 03359 , bool& escaped) 03360 { 03361 // Assume no escape 03362 secondCh = 0; 03363 escaped = false; 03364 03365 // We have to insure that its all in one entity 03366 const XMLSize_t curReader = fReaderMgr.getCurrentReaderNum(); 03367 03368 // If the next char is a pound, then its a character reference and we 03369 // need to expand it always. 03370 if (fReaderMgr.skippedChar(chPound)) 03371 { 03372 // Its a character reference, so scan it and get back the numeric 03373 // value it represents. 03374 if (!scanCharRef(firstCh, secondCh)) 03375 return EntityExp_Failed; 03376 03377 escaped = true; 03378 03379 if (curReader != fReaderMgr.getCurrentReaderNum()) 03380 emitError(XMLErrs::PartialMarkupInEntity); 03381 03382 return EntityExp_Returned; 03383 } 03384 03385 // Expand it since its a normal entity ref 03386 XMLBufBid bbName(&fBufMgr); 03387 03388 int colonPosition; 03389 bool validName = fDoNamespaces ? fReaderMgr.getQName(bbName.getBuffer(), &colonPosition) : 03390 fReaderMgr.getName(bbName.getBuffer()); 03391 if (!validName) 03392 { 03393 if (bbName.isEmpty()) 03394 emitError(XMLErrs::ExpectedEntityRefName); 03395 else 03396 emitError(XMLErrs::InvalidEntityRefName, bbName.getRawBuffer()); 03397 return EntityExp_Failed; 03398 } 03399 03400 // Next char must be a semi-colon. But if its not, just emit 03401 // an error and try to continue. 
03402 if (!fReaderMgr.skippedChar(chSemiColon)) 03403 emitError(XMLErrs::UnterminatedEntityRef, bbName.getRawBuffer()); 03404 03405 // Make sure we ended up on the same entity reader as the & char 03406 if (curReader != fReaderMgr.getCurrentReaderNum()) 03407 emitError(XMLErrs::PartialMarkupInEntity); 03408 03409 // Look up the name in the general entity pool 03410 XMLEntityDecl* decl = fDTDGrammar->getEntityDecl(bbName.getRawBuffer()); 03411 03412 // If it does not exist, then obviously an error 03413 if (!decl) 03414 { 03415 // XML 1.0 Section 4.1 03416 // Well-formedness Constraint for entity not found: 03417 // In a document without any DTD, a document with only an internal DTD subset which contains no parameter entity references, 03418 // or a document with "standalone='yes'", for an entity reference that does not occur within the external subset 03419 // or a parameter entity 03420 // 03421 // Else it's Validity Constraint 03422 if (fStandalone || fHasNoDTD) 03423 emitError(XMLErrs::EntityNotFound, bbName.getRawBuffer()); 03424 else { 03425 if (fValidate) 03426 fValidator->emitError(XMLValid::VC_EntityNotFound, bbName.getRawBuffer()); 03427 } 03428 03429 return EntityExp_Failed; 03430 } 03431 03432 // XML 1.0 Section 4.1 03433 // If we are a standalone document, then it has to have been declared 03434 // in the internal subset. 
03435 if (fStandalone && !decl->getDeclaredInIntSubset()) 03436 emitError(XMLErrs::IllegalRefInStandalone, bbName.getRawBuffer()); 03437 03438 if (decl->isExternal()) 03439 { 03440 // If its unparsed, then its not valid here 03441 if (decl->isUnparsed()) 03442 { 03443 emitError(XMLErrs::NoUnparsedEntityRefs, bbName.getRawBuffer()); 03444 return EntityExp_Failed; 03445 } 03446 03447 // If we are in an attribute value, then not valid but keep going 03448 if (inAttVal) 03449 emitError(XMLErrs::NoExtRefsInAttValue); 03450 03451 // And now create a reader to read this entity 03452 InputSource* srcUsed; 03453 XMLReader* reader = fReaderMgr.createReader 03454 ( 03455 decl->getBaseURI() 03456 , decl->getSystemId() 03457 , decl->getPublicId() 03458 , false 03459 , XMLReader::RefFrom_NonLiteral 03460 , XMLReader::Type_General 03461 , XMLReader::Source_External 03462 , srcUsed 03463 , fCalculateSrcOfs 03464 , fLowWaterMark 03465 , fDisableDefaultEntityResolution 03466 ); 03467 03468 // Put a janitor on the source so it gets cleaned up on exit 03469 Janitor<InputSource> janSrc(srcUsed); 03470 03471 // If the creation failed, and its not because the source was empty, 03472 // then emit an error and return. 03473 if (!reader) 03474 ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Gen_CouldNotOpenExtEntity, srcUsed ? srcUsed->getSystemId() : decl->getSystemId(), fMemoryManager); 03475 03476 // Push the reader. If its a recursive expansion, then emit an error 03477 // and return an failure. 
03478 if (!fReaderMgr.pushReader(reader, decl)) 03479 { 03480 emitError(XMLErrs::RecursiveEntity, decl->getName()); 03481 return EntityExp_Failed; 03482 } 03483 03484 // here's where we need to check if there's a SecurityManager, 03485 // how many entity references we've had 03486 if(fSecurityManager != 0 && ++fEntityExpansionCount > fEntityExpansionLimit) { 03487 XMLCh expLimStr[32]; 03488 XMLString::sizeToText(fEntityExpansionLimit, expLimStr, 31, 10, fMemoryManager); 03489 emitError 03490 ( 03491 XMLErrs::EntityExpansionLimitExceeded 03492 , expLimStr 03493 ); 03494 // there seems nothing better to do than reset the entity expansion counter 03495 fEntityExpansionCount = 0; 03496 } 03497 03498 // Do a start entity reference event. 03499 // 03500 // <TBD> For now, we supress them in att values. Later, when 03501 // the stuff is in place to correctly allow DOM to handle them 03502 // we'll turn this back on. 03503 if (fDocHandler && !inAttVal) 03504 fDocHandler->startEntityReference(*decl); 03505 03506 // If it starts with the XML string, then parse a text decl 03507 if (checkXMLDecl(true)) 03508 scanXMLDecl(Decl_Text); 03509 } 03510 else 03511 { 03512 // If its one of the special char references, then we can return 03513 // it as a character, and its considered escaped. 03514 if (decl->getIsSpecialChar()) 03515 { 03516 firstCh = decl->getValue()[0]; 03517 escaped = true; 03518 return EntityExp_Returned; 03519 } 03520 03521 // Create a reader over a memory stream over the entity value 03522 // We force it to assume UTF-16 by passing in an encoding 03523 // string. This way it won't both trying to predecode the 03524 // first line, looking for an XML/TextDecl. 
03525 XMLReader* valueReader = fReaderMgr.createIntEntReader 03526 ( 03527 decl->getName() 03528 , XMLReader::RefFrom_NonLiteral 03529 , XMLReader::Type_General 03530 , decl->getValue() 03531 , decl->getValueLen() 03532 , false 03533 ); 03534 03535 // Try to push the entity reader onto the reader manager stack, 03536 // where it will become the subsequent input. If it fails, that 03537 // means the entity is recursive, so issue an error. The reader 03538 // will have just been discarded, but we just keep going. 03539 if (!fReaderMgr.pushReader(valueReader, decl)) 03540 emitError(XMLErrs::RecursiveEntity, decl->getName()); 03541 03542 // here's where we need to check if there's a SecurityManager, 03543 // how many entity references we've had 03544 if(fSecurityManager != 0 && ++fEntityExpansionCount > fEntityExpansionLimit) { 03545 XMLCh expLimStr[32]; 03546 XMLString::sizeToText(fEntityExpansionLimit, expLimStr, 31, 10, fMemoryManager); 03547 emitError 03548 ( 03549 XMLErrs::EntityExpansionLimitExceeded 03550 , expLimStr 03551 ); 03552 } 03553 03554 // Do a start entity reference event. 03555 // 03556 // <TBD> For now, we supress them in att values. Later, when 03557 // the stuff is in place to correctly allow DOM to handle them 03558 // we'll turn this back on. 03559 if (fDocHandler && !inAttVal) 03560 fDocHandler->startEntityReference(*decl); 03561 03562 // If it starts with the XML string, then it's an error 03563 if (checkXMLDecl(true)) { 03564 emitError(XMLErrs::TextDeclNotLegalHere); 03565 fReaderMgr.skipPastChar(chCloseAngle); 03566 } 03567 } 03568 return EntityExp_Pushed; 03569 } 03570 03571 03572 XERCES_CPP_NAMESPACE_END