GME
13
|
00001 /* 00002 * Licensed to the Apache Software Foundation (ASF) under one or more 00003 * contributor license agreements. See the NOTICE file distributed with 00004 * this work for additional information regarding copyright ownership. 00005 * The ASF licenses this file to You under the Apache License, Version 2.0 00006 * (the "License"); you may not use this file except in compliance with 00007 * the License. You may obtain a copy of the License at 00008 * 00009 * http://www.apache.org/licenses/LICENSE-2.0 00010 * 00011 * Unless required by applicable law or agreed to in writing, software 00012 * distributed under the License is distributed on an "AS IS" BASIS, 00013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00014 * See the License for the specific language governing permissions and 00015 * limitations under the License. 00016 */ 00017 00018 /* 00019 * $Id: WFXMLScanner.cpp 833045 2009-11-05 13:21:27Z borisk $ 00020 */ 00021 00022 00023 // --------------------------------------------------------------------------- 00024 // Includes 00025 // --------------------------------------------------------------------------- 00026 #include <xercesc/internal/WFXMLScanner.hpp> 00027 #include <xercesc/util/Janitor.hpp> 00028 #include <xercesc/util/RuntimeException.hpp> 00029 #include <xercesc/util/UnexpectedEOFException.hpp> 00030 #include <xercesc/sax/InputSource.hpp> 00031 #include <xercesc/framework/XMLDocumentHandler.hpp> 00032 #include <xercesc/framework/XMLEntityHandler.hpp> 00033 #include <xercesc/framework/XMLPScanToken.hpp> 00034 #include <xercesc/framework/XMLValidityCodes.hpp> 00035 #include <xercesc/internal/EndOfEntityException.hpp> 00036 #include <xercesc/util/OutOfMemoryException.hpp> 00037 00038 XERCES_CPP_NAMESPACE_BEGIN 00039 00040 // --------------------------------------------------------------------------- 00041 // WFXMLScanner: Constructors and Destructor 00042 // 
--------------------------------------------------------------------------- 00043 00044 00045 typedef JanitorMemFunCall<WFXMLScanner> CleanupType; 00046 typedef JanitorMemFunCall<ReaderMgr> ReaderMgrResetType; 00047 00048 00049 WFXMLScanner::WFXMLScanner( XMLValidator* const valToAdopt 00050 , GrammarResolver* const grammarResolver 00051 , MemoryManager* const manager) : 00052 00053 XMLScanner(valToAdopt, grammarResolver, manager) 00054 , fElementIndex(0) 00055 , fElements(0) 00056 , fEntityTable(0) 00057 , fAttrNameHashList(0) 00058 , fAttrNSList(0) 00059 , fElementLookup(0) 00060 { 00061 CleanupType cleanup(this, &WFXMLScanner::cleanUp); 00062 00063 try 00064 { 00065 commonInit(); 00066 } 00067 catch(const OutOfMemoryException&) 00068 { 00069 // Don't cleanup when out of memory, since executing the 00070 // code can cause problems. 00071 cleanup.release(); 00072 00073 throw; 00074 } 00075 00076 cleanup.release(); 00077 } 00078 00079 WFXMLScanner::WFXMLScanner( XMLDocumentHandler* const docHandler 00080 , DocTypeHandler* const docTypeHandler 00081 , XMLEntityHandler* const entityHandler 00082 , XMLErrorReporter* const errHandler 00083 , XMLValidator* const valToAdopt 00084 , GrammarResolver* const grammarResolver 00085 , MemoryManager* const manager) : 00086 00087 XMLScanner(docHandler, docTypeHandler, entityHandler, errHandler, valToAdopt, grammarResolver, manager) 00088 , fElementIndex(0) 00089 , fElements(0) 00090 , fEntityTable(0) 00091 , fAttrNameHashList(0) 00092 , fAttrNSList(0) 00093 , fElementLookup(0) 00094 { 00095 CleanupType cleanup(this, &WFXMLScanner::cleanUp); 00096 00097 try 00098 { 00099 commonInit(); 00100 } 00101 catch(const OutOfMemoryException&) 00102 { 00103 // Don't cleanup when out of memory, since executing the 00104 // code can cause problems. 
00105 cleanup.release(); 00106 00107 throw; 00108 } 00109 00110 cleanup.release(); 00111 } 00112 00113 WFXMLScanner::~WFXMLScanner() 00114 { 00115 cleanUp(); 00116 } 00117 00118 // --------------------------------------------------------------------------- 00119 // XMLScanner: Getter methods 00120 // --------------------------------------------------------------------------- 00121 NameIdPool<DTDEntityDecl>* WFXMLScanner::getEntityDeclPool() 00122 { 00123 return 0; 00124 } 00125 00126 const NameIdPool<DTDEntityDecl>* WFXMLScanner::getEntityDeclPool() const 00127 { 00128 return 0; 00129 } 00130 00131 // --------------------------------------------------------------------------- 00132 // WFXMLScanner: Main entry point to scan a document 00133 // --------------------------------------------------------------------------- 00134 void WFXMLScanner::scanDocument(const InputSource& src) 00135 { 00136 // Bump up the sequence id for this parser instance. This will invalidate 00137 // any previous progressive scan tokens. 00138 fSequenceId++; 00139 00140 ReaderMgrResetType resetReaderMgr(&fReaderMgr, &ReaderMgr::reset); 00141 00142 try 00143 { 00144 // Reset the scanner and its plugged in stuff for a new run. This 00145 // resets all the data structures, creates the initial reader and 00146 // pushes it on the stack, and sets up the base document path. 00147 scanReset(src); 00148 00149 // If we have a document handler, then call the start document 00150 if (fDocHandler) 00151 fDocHandler->startDocument(); 00152 00153 // Scan the prolog part, which is everything before the root element 00154 // including the DTD subsets. 00155 scanProlog(); 00156 00157 // If we got to the end of input, then its not a valid XML file. 00158 // Else, go on to scan the content. 
00159 if (fReaderMgr.atEOF()) 00160 { 00161 emitError(XMLErrs::EmptyMainEntity); 00162 } 00163 else 00164 { 00165 // Scan content, and tell it its not an external entity 00166 if (scanContent()) 00167 { 00168 // That went ok, so scan for any miscellaneous stuff 00169 if (!fReaderMgr.atEOF()) 00170 scanMiscellaneous(); 00171 } 00172 } 00173 00174 // If we have a document handler, then call the end document 00175 if (fDocHandler) 00176 fDocHandler->endDocument(); 00177 } 00178 // NOTE: 00179 // 00180 // In all of the error processing below, the emitError() call MUST come 00181 // before the flush of the reader mgr, or it will fail because it tries 00182 // to find out the position in the XML source of the error. 00183 catch(const XMLErrs::Codes) 00184 { 00185 // This is a 'first failure' exception, so fall through 00186 } 00187 catch(const XMLValid::Codes) 00188 { 00189 // This is a 'first fatal error' type exit, so fall through 00190 } 00191 catch(const XMLException& excToCatch) 00192 { 00193 // Emit the error and catch any user exception thrown from here. Make 00194 // sure in all cases we flush the reader manager. 00195 fInException = true; 00196 try 00197 { 00198 if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning) 00199 emitError 00200 ( 00201 XMLErrs::XMLException_Warning 00202 , excToCatch.getCode() 00203 , excToCatch.getMessage() 00204 ); 00205 else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal) 00206 emitError 00207 ( 00208 XMLErrs::XMLException_Fatal 00209 , excToCatch.getCode() 00210 , excToCatch.getMessage() 00211 ); 00212 else 00213 emitError 00214 ( 00215 XMLErrs::XMLException_Error 00216 , excToCatch.getCode() 00217 , excToCatch.getMessage() 00218 ); 00219 } 00220 catch(const OutOfMemoryException&) 00221 { 00222 // This is a special case for out-of-memory 00223 // conditions, because resetting the ReaderMgr 00224 // can be problematic. 
00225 resetReaderMgr.release(); 00226 00227 throw; 00228 } 00229 } 00230 catch(const OutOfMemoryException&) 00231 { 00232 // This is a special case for out-of-memory 00233 // conditions, because resetting the ReaderMgr 00234 // can be problematic. 00235 resetReaderMgr.release(); 00236 00237 throw; 00238 } 00239 } 00240 00241 00242 bool WFXMLScanner::scanNext(XMLPScanToken& token) 00243 { 00244 // Make sure this token is still legal 00245 if (!isLegalToken(token)) 00246 ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Scan_BadPScanToken, fMemoryManager); 00247 00248 // Find the next token and remember the reader id 00249 XMLSize_t orgReader; 00250 XMLTokens curToken; 00251 bool retVal = true; 00252 00253 ReaderMgrResetType resetReaderMgr(&fReaderMgr, &ReaderMgr::reset); 00254 00255 try 00256 { 00257 while (true) 00258 { 00259 // We have to handle any end of entity exceptions that happen here. 00260 // We could be at the end of X nested entities, each of which will 00261 // generate an end of entity exception as we try to move forward. 
00262 try 00263 { 00264 curToken = senseNextToken(orgReader); 00265 break; 00266 } 00267 catch(const EndOfEntityException& toCatch) 00268 { 00269 // Send an end of entity reference event 00270 if (fDocHandler) 00271 fDocHandler->endEntityReference(toCatch.getEntity()); 00272 } 00273 } 00274 00275 if (curToken == Token_CharData) 00276 { 00277 scanCharData(fCDataBuf); 00278 } 00279 else if (curToken == Token_EOF) 00280 { 00281 if (!fElemStack.isEmpty()) 00282 { 00283 const ElemStack::StackElem* topElem = fElemStack.popTop(); 00284 emitError 00285 ( 00286 XMLErrs::EndedWithTagsOnStack 00287 , topElem->fThisElement->getFullName() 00288 ); 00289 } 00290 00291 retVal = false; 00292 } 00293 else 00294 { 00295 // Its some sort of markup 00296 bool gotData = true; 00297 switch(curToken) 00298 { 00299 case Token_CData : 00300 // Make sure we are within content 00301 if (fElemStack.isEmpty()) 00302 emitError(XMLErrs::CDATAOutsideOfContent); 00303 scanCDSection(); 00304 break; 00305 00306 case Token_Comment : 00307 scanComment(); 00308 break; 00309 00310 case Token_EndTag : 00311 scanEndTag(gotData); 00312 break; 00313 00314 case Token_PI : 00315 scanPI(); 00316 break; 00317 00318 case Token_StartTag : 00319 if (fDoNamespaces) 00320 scanStartTagNS(gotData); 00321 else 00322 scanStartTag(gotData); 00323 break; 00324 00325 default : 00326 fReaderMgr.skipToChar(chOpenAngle); 00327 break; 00328 } 00329 00330 if (orgReader != fReaderMgr.getCurrentReaderNum()) 00331 emitError(XMLErrs::PartialMarkupInEntity); 00332 00333 // If we hit the end, then do the miscellaneous part 00334 if (!gotData) 00335 { 00336 // That went ok, so scan for any miscellaneous stuff 00337 scanMiscellaneous(); 00338 00339 if (fDocHandler) 00340 fDocHandler->endDocument(); 00341 } 00342 } 00343 } 00344 // NOTE: 00345 // 00346 // In all of the error processing below, the emitError() call MUST come 00347 // before the flush of the reader mgr, or it will fail because it tries 00348 // to find out the position in 
the XML source of the error. 00349 catch(const XMLErrs::Codes) 00350 { 00351 // This is a 'first failure' exception, so return failure 00352 retVal = false; 00353 } 00354 catch(const XMLValid::Codes) 00355 { 00356 // This is a 'first fatal error' type exit, so return failure 00357 retVal = false; 00358 } 00359 catch(const XMLException& excToCatch) 00360 { 00361 // Emit the error and catch any user exception thrown from here. Make 00362 // sure in all cases we flush the reader manager. 00363 fInException = true; 00364 try 00365 { 00366 if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning) 00367 emitError 00368 ( 00369 XMLErrs::XMLException_Warning 00370 , excToCatch.getCode() 00371 , excToCatch.getMessage() 00372 ); 00373 else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal) 00374 emitError 00375 ( 00376 XMLErrs::XMLException_Fatal 00377 , excToCatch.getCode() 00378 , excToCatch.getMessage() 00379 ); 00380 else 00381 emitError 00382 ( 00383 XMLErrs::XMLException_Error 00384 , excToCatch.getCode() 00385 , excToCatch.getMessage() 00386 ); 00387 } 00388 catch(const OutOfMemoryException&) 00389 { 00390 // This is a special case for out-of-memory 00391 // conditions, because resetting the ReaderMgr 00392 // can be problematic. 00393 resetReaderMgr.release(); 00394 00395 throw; 00396 } 00397 00398 // Return failure 00399 retVal = false; 00400 } 00401 catch(const OutOfMemoryException&) 00402 { 00403 throw; 00404 } 00405 00406 // If we are not at the end, release the object that will 00407 // reset the ReaderMgr. 00408 if (retVal) 00409 resetReaderMgr.release(); 00410 00411 return retVal; 00412 } 00413 00414 00415 00416 // --------------------------------------------------------------------------- 00417 // WFXMLScanner: Private helper methods. 
00418 // --------------------------------------------------------------------------- 00419 00420 // This method handles the common initialization, to avoid having to do 00421 // it redundantly in multiple constructors. 00422 void WFXMLScanner::commonInit() 00423 { 00424 fEntityTable = new (fMemoryManager) ValueHashTableOf<XMLCh>(11, fMemoryManager); 00425 fAttrNameHashList = new (fMemoryManager)ValueVectorOf<XMLSize_t>(16, fMemoryManager); 00426 fAttrNSList = new (fMemoryManager) ValueVectorOf<XMLAttr*>(8, fMemoryManager); 00427 fElements = new (fMemoryManager) RefVectorOf<XMLElementDecl>(32, true, fMemoryManager); 00428 fElementLookup = new (fMemoryManager) RefHashTableOf<XMLElementDecl>(109, false, fMemoryManager); 00429 00430 // Add the default entity entries for the character refs that must always 00431 // be present. 00432 fEntityTable->put((void*) XMLUni::fgAmp, chAmpersand); 00433 fEntityTable->put((void*) XMLUni::fgLT, chOpenAngle); 00434 fEntityTable->put((void*) XMLUni::fgGT, chCloseAngle); 00435 fEntityTable->put((void*) XMLUni::fgQuot, chDoubleQuote); 00436 fEntityTable->put((void*) XMLUni::fgApos, chSingleQuote); 00437 } 00438 00439 void WFXMLScanner::cleanUp() 00440 { 00441 delete fEntityTable; 00442 delete fAttrNameHashList; 00443 delete fAttrNSList; 00444 delete fElementLookup; 00445 delete fElements; 00446 } 00447 00448 // This method will reset the scanner data structures, and related plugged 00449 // in stuff, for a new scan session. We get the input source for the primary 00450 // XML entity, create the reader for it, and push it on the stack so that 00451 // upon successful return from here we are ready to go. 00452 void WFXMLScanner::scanReset(const InputSource& src) 00453 { 00454 // For all installed handlers, send reset events. This gives them 00455 // a chance to flush any cached data. 
00456 if (fDocHandler) 00457 fDocHandler->resetDocument(); 00458 if (fEntityHandler) 00459 fEntityHandler->resetEntities(); 00460 if (fErrorReporter) 00461 fErrorReporter->resetErrors(); 00462 00463 // Reset the element stack, and give it the latest ids for the special 00464 // URIs it has to know about. 00465 fElemStack.reset 00466 ( 00467 fEmptyNamespaceId 00468 , fUnknownNamespaceId 00469 , fXMLNamespaceId 00470 , fXMLNSNamespaceId 00471 ); 00472 00473 // Reset some status flags 00474 fInException = false; 00475 fStandalone = false; 00476 fErrorCount = 0; 00477 fHasNoDTD = true; 00478 fElementIndex = 0; 00479 00480 // Reset elements lookup table 00481 fElementLookup->removeAll(); 00482 00483 // Handle the creation of the XML reader object for this input source. 00484 // This will provide us with transcoding and basic lexing services. 00485 XMLReader* newReader = fReaderMgr.createReader 00486 ( 00487 src 00488 , true 00489 , XMLReader::RefFrom_NonLiteral 00490 , XMLReader::Type_General 00491 , XMLReader::Source_External 00492 , fCalculateSrcOfs 00493 , fLowWaterMark 00494 ); 00495 00496 if (!newReader) { 00497 if (src.getIssueFatalErrorIfNotFound()) 00498 ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Scan_CouldNotOpenSource, src.getSystemId(), fMemoryManager); 00499 else 00500 ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Scan_CouldNotOpenSource_Warning, src.getSystemId(), fMemoryManager); 00501 } 00502 00503 // Push this read onto the reader manager 00504 fReaderMgr.pushReader(newReader, 0); 00505 00506 // and reset security-related things if necessary: 00507 if(fSecurityManager != 0) 00508 { 00509 fEntityExpansionLimit = fSecurityManager->getEntityExpansionLimit(); 00510 fEntityExpansionCount = 0; 00511 } 00512 } 00513 00514 // This method is called between markup in content. It scans for character 00515 // data that is sent to the document handler. It watches for any markup 00516 // characters that would indicate that the character data has ended. 
// It also handles expansion of general and character entities.
//
// sendData() is a local static helper for this method which handles some
// code that must be done in three different places here.
//
// NOTE(review): the description above reads as if it was written for a
// character-data scanning routine. The function that follows does no
// scanning: it forwards an already filled buffer to the document handler
// (if any) and then resets the buffer.
void WFXMLScanner::sendCharData(XMLBuffer& toSend)
{
    // If no data in the buffer, then nothing to do
    if (toSend.isEmpty())
        return;

    // Always assume its just char data if not validating
    if (fDocHandler)
        fDocHandler->docCharacters(toSend.getRawBuffer(), toSend.getLen(), false);

    // Reset buffer
    toSend.reset();
}

// ---------------------------------------------------------------------------
//  WFXMLScanner: Private scanning methods
// ---------------------------------------------------------------------------

// This method will kick off the scanning of the primary content of the
// document, i.e. the elements.
//
// Loops token by token until end of input is sensed or one of the tag
// scanners clears 'gotData'. Always returns true; problems are reported
// through emitError() or by exceptions other than EndOfEntityException,
// which is handled here.
bool WFXMLScanner::scanContent()
{
    // Go into a loop until we hit the end of the root element, or we fall
    // out because there is no root element.
    //
    // We have to do kind of a deeply nested double loop here in order to
    // avoid doing the setup/teardown of the exception handler on each
    // round. Doing it this way we only do it when an exception actually
    // occurs.
    bool gotData = true;
    bool inMarkup = false;
    while (gotData)
    {
        try
        {
            while (gotData)
            {
                // Sense what the next top level token is. According to what
                // this tells us, we will call something to handle that kind
                // of thing.
                XMLSize_t orgReader;
                const XMLTokens curToken = senseNextToken(orgReader);

                // Handle character data and end of file specially. Char data
                // is not markup so we don't want to handle it in the loop
                // below.
                if (curToken == Token_CharData)
                {
                    // Scan the character data and call appropriate events. Let
                    // him use our local character data buffer for efficiency.
                    scanCharData(fCDataBuf);
                    continue;
                }
                else if (curToken == Token_EOF)
                {
                    // The element stack better be empty at this point or we
                    // ended prematurely before all elements were closed.
                    if (!fElemStack.isEmpty())
                    {
                        const ElemStack::StackElem* topElem = fElemStack.popTop();
                        emitError
                        (
                            XMLErrs::EndedWithTagsOnStack
                            , topElem->fThisElement->getFullName()
                        );
                    }

                    // Its the end of file, so clear the got data flag
                    gotData = false;
                    continue;
                }

                // We are in some sort of markup now
                inMarkup = true;

                // According to the token we got, call the appropriate
                // scanning method.
                switch(curToken)
                {
                    case Token_CData :
                        // Make sure we are within content
                        if (fElemStack.isEmpty())
                            emitError(XMLErrs::CDATAOutsideOfContent);
                        scanCDSection();
                        break;

                    case Token_Comment :
                        scanComment();
                        break;

                    case Token_EndTag :
                        scanEndTag(gotData);
                        break;

                    case Token_PI :
                        scanPI();
                        break;

                    case Token_StartTag :
                        if (fDoNamespaces)
                            scanStartTagNS(gotData);
                        else
                            scanStartTag(gotData);
                        break;

                    default :
                        // Any other token: skip forward to the next '<'.
                        fReaderMgr.skipToChar(chOpenAngle);
                        break;
                }

                // The markup must end in the same reader it started in.
                if (orgReader != fReaderMgr.getCurrentReaderNum())
                    emitError(XMLErrs::PartialMarkupInEntity);

                // And we are back out of markup again
                inMarkup = false;
            }
        }
        catch(const EndOfEntityException& toCatch)
        {
            // If we were in some markup when this happened, then its a
            // partial markup error.
            if (inMarkup)
                emitError(XMLErrs::PartialMarkupInEntity);

            // Send an end of entity reference event
            if (fDocHandler)
                fDocHandler->endEntityReference(toCatch.getEntity());

            inMarkup = false;
        }
    }

    // It went ok, so return success
    return true;
}


// Scans an end tag and matches it against the element on top of the element
// stack.
//
//  gotData   Out parameter. Set to false only when the end tag that was
//            scanned closed the root element and the tag name matched;
//            true on every other return.
//
// Throws RuntimeException (Scan_UnbalancedStartEnd) when the element stack
// is already empty.
void WFXMLScanner::scanEndTag(bool& gotData)
{
    // Assume we will still have data until proven otherwise. It will only
    // ever be false if this is the end of the root element.
    gotData = true;

    // Check if the element stack is empty. If so, then this is an unbalanced
    // element (i.e. more ends than starts, perhaps because of bad text
    // causing one to be skipped.)
    if (fElemStack.isEmpty())
    {
        emitError(XMLErrs::MoreEndThanStartTags);
        fReaderMgr.skipPastChar(chCloseAngle);
        ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Scan_UnbalancedStartEnd, fMemoryManager);
    }

    // Pop the stack of the element we are supposed to be ending. Remember
    // that we don't own this. The stack just keeps them and reuses them.
    // The URI id has to be read before the pop.
    unsigned int uriId = (fDoNamespaces)
                       ? fElemStack.getCurrentURI() : fEmptyNamespaceId;
    const ElemStack::StackElem* topElem = fElemStack.popTop();

    // See if it was the root element, to avoid multiple calls below
    const bool isRoot = fElemStack.isEmpty();

    // Make sure that its the end of the element that we expect
    if (!fReaderMgr.skippedStringLong(topElem->fThisElement->getFullName()))
    {
        emitError
        (
            XMLErrs::ExpectedEndOfTagX
            , topElem->fThisElement->getFullName()
        );
        fReaderMgr.skipPastChar(chCloseAngle);
        // Note: the element has already been popped and gotData stays true.
        return;
    }

    // Make sure we are back on the same reader as where we started
    if (topElem->fReaderNum != fReaderMgr.getCurrentReaderNum())
        emitError(XMLErrs::PartialTagMarkupError);

    // Skip optional whitespace
    fReaderMgr.skipPastSpaces();

    // Make sure we find the closing bracket
    if (!fReaderMgr.skippedChar(chCloseAngle))
    {
        emitError
        (
            XMLErrs::UnterminatedEndTag
            , topElem->fThisElement->getFullName()
        );
    }

    // If we have a doc handler, tell it about the end tag
    if (fDocHandler)
    {
        fDocHandler->endElement
        (
            *topElem->fThisElement
            , uriId
            , isRoot
            , topElem->fThisElement->getElementName()->getPrefix()
        );
    }

    // If this was the root, then done with content
    gotData = !isRoot;
}

// Skips over a DOCTYPE declaration without interpreting it: advances to the
// first '[' or '>', and if a '[' was found skips past the next ']' before
// skipping past the closing '>'.
void WFXMLScanner::scanDocTypeDecl()
{
    // Just skips over it
    // REVISIT: Should we issue a warning
    static const XMLCh doctypeIE[] =
    {
            chOpenSquare, chCloseAngle, chNull
    };
    XMLCh nextCh = fReaderMgr.skipUntilIn(doctypeIE);

    if (nextCh == chOpenSquare)
        fReaderMgr.skipPastChar(chCloseSquare);

    fReaderMgr.skipPastChar(chCloseAngle);
}

// Scans a start tag when namespace processing is off: reads the element
// name, pushes the element on the element stack, collects the attributes
// into fAttrList and finally reports the element to the document handler.
//
//  gotData   Out parameter. Set to false only when the tag is an empty
//            root element (<root/>); true otherwise.
//
// Returns false when the tag is so malformed that scanning it was given up
// (no startElement() event is sent in that case), true otherwise.
// Throws UnexpectedEOFException if a null character is peeked inside the
// tag.
bool WFXMLScanner::scanStartTag(bool& gotData)
{
    // Assume we will still have data until proven otherwise. It will only
    // ever be false if this is the root and its empty.
    gotData = true;

    // Get the QName. In this case, we are not doing namespaces, so we just
    // use it as is and don't have to break it into parts.
    if (!fReaderMgr.getName(fQNameBuf))
    {
        emitError(XMLErrs::ExpectedElementName);
        fReaderMgr.skipToChar(chOpenAngle);
        return false;
    }

    // Assume it won't be an empty tag
    bool isEmpty = false;

    // See if its the root element
    const bool isRoot = fElemStack.isEmpty();

    // Lets try to look up the element
    const XMLCh* qnameRawBuf = fQNameBuf.getRawBuffer();
    XMLElementDecl* elemDecl = fElementLookup->get(qnameRawBuf);

    if (!elemDecl) {

        // First time this name is seen in the current document: take the
        // next unused declaration object from fElements, or append a new
        // one if all existing ones are in use.
        if (fElementIndex < fElements->size()) {
            elemDecl = fElements->elementAt(fElementIndex);
        }
        else {
            elemDecl = new (fGrammarPoolMemoryManager) DTDElementDecl
            (
                fGrammarPoolMemoryManager
            );
            fElements->addElement(elemDecl);
        }

        elemDecl->setElementName(XMLUni::fgZeroLenString, qnameRawBuf, fEmptyNamespaceId);
        fElementLookup->put((void*)elemDecl->getFullName(), elemDecl);
        fElementIndex++;
    }

    // Expand the element stack and add the new element
    fElemStack.addLevel(elemDecl, fReaderMgr.getCurrentReaderNum());

    // Skip any whitespace after the name
    fReaderMgr.skipPastSpaces();

    // We loop until we either see a /> or >, handling attribute/value
    // pairs until we get there.
    XMLSize_t attCount = 0;
    XMLSize_t curAttListSize = fAttrList->size();
    while (true)
    {
        // And get the next non-space character
        XMLCh nextCh = fReaderMgr.peekNextChar();

        // If the next character is not a slash or closed angle bracket,
        // then it must be whitespace, since whitespace is required
        // between the end of the last attribute and the name of the next
        // one.
        if (attCount)
        {
            if ((nextCh != chForwardSlash) && (nextCh != chCloseAngle))
            {
                bool bFoundSpace;
                fReaderMgr.skipPastSpaces(bFoundSpace);
                if (!bFoundSpace)
                {
                    // Emit the error but keep on going
                    emitError(XMLErrs::ExpectedWhitespace);
                }
                // Ok, peek another char
                nextCh = fReaderMgr.peekNextChar();
            }
        }

        // Ok, here we first check for any of the special case characters.
        // If its not one, then we do the normal case processing, which
        // assumes that we've hit an attribute value, Otherwise, we do all
        // the special case checks.
        if (!fReaderMgr.getCurrentReader()->isSpecialStartTagChar(nextCh))
        {
            // Assume its going to be an attribute, so get a name from
            // the input.
            if (!fReaderMgr.getName(fAttNameBuf))
            {
                emitError(XMLErrs::ExpectedAttrName);
                fReaderMgr.skipPastChar(chCloseAngle);
                return false;
            }

            // And next must be an equal sign
            if (!scanEq())
            {
                static const XMLCh tmpList[] =
                {
                    chSingleQuote, chDoubleQuote, chCloseAngle
                    , chOpenAngle, chForwardSlash, chNull
                };

                emitError(XMLErrs::ExpectedEqSign);

                // Try to sync back up by skipping forward until we either
                // hit something meaningful.
                const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList);

                if ((chFound == chCloseAngle) || (chFound == chForwardSlash))
                {
                    // Jump back to top for normal processing of these
                    continue;
                }
                else if ((chFound == chSingleQuote)
                      ||  (chFound == chDoubleQuote)
                      ||  fReaderMgr.getCurrentReader()->isWhitespace(chFound))
                {
                    // Just fall through assuming that the value is to follow
                }
                else if (chFound == chOpenAngle)
                {
                    // Assume a malformed tag and that new one is starting
                    emitError(XMLErrs::UnterminatedStartTag, qnameRawBuf);
                    return false;
                }
                else
                {
                    // Something went really wrong
                    return false;
                }
            }

            // See if this attribute is declared more than once for this
            // element. The stored hashes are compared first so the full
            // string comparison only runs for candidates whose hash matches.
            const XMLCh* attNameRawBuf = fAttNameBuf.getRawBuffer();
            XMLSize_t attNameHash = XMLString::hash(attNameRawBuf, 109);

            if (attCount) {

                for (XMLSize_t k=0; k < attCount; k++) {

                    if (fAttrNameHashList->elementAt(k) == attNameHash) {
                        if (
                               XMLString::equals
                               (
                                   fAttrList->elementAt(k)->getName()
                                   , attNameRawBuf
                               )
                           )
                        {
                            emitError
                            (
                                XMLErrs::AttrAlreadyUsedInSTag
                                , attNameRawBuf
                                , qnameRawBuf
                            );
                            break;
                        }
                    }
                }
            }

            // Skip any whitespace before the value and then scan the att
            // value. This will come back normalized with entity refs and
            // char refs expanded.
            fReaderMgr.skipPastSpaces();
            if (!scanAttValue(attNameRawBuf, fAttValueBuf))
            {
                static const XMLCh tmpList[] =
                {
                    chCloseAngle, chOpenAngle, chForwardSlash, chNull
                };

                emitError(XMLErrs::ExpectedAttrValue);

                // It failed, so lets try to get synced back up. We skip
                // forward until we find some whitespace or one of the
                // chars in our list.
                const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList);

                if ((chFound == chCloseAngle)
                ||  (chFound == chForwardSlash)
                ||  fReaderMgr.getCurrentReader()->isWhitespace(chFound))
                {
                    // Just fall through and process this attribute, though
                    // the value will be "".
                }
                else if (chFound == chOpenAngle)
                {
                    // Assume a malformed tag and that new one is starting
                    emitError(XMLErrs::UnterminatedStartTag, qnameRawBuf);
                    return false;
                }
                else
                {
                    // Something went really wrong
                    return false;
                }
            }

            // Add this attribute to the attribute list that we use to
            // pass them to the handler. We reuse its existing elements
            // but expand it as required. The hash list is kept in step
            // with the attribute list so index k refers to the same
            // attribute in both.
            XMLAttr* curAtt;
            if (attCount >= curAttListSize)
            {
                curAtt = new (fMemoryManager) XMLAttr
                (
                    0
                    , attNameRawBuf
                    , XMLUni::fgZeroLenString
                    , fAttValueBuf.getRawBuffer()
                    , XMLAttDef::CData
                    , true
                    , fMemoryManager
                );
                fAttrList->addElement(curAtt);
                fAttrNameHashList->addElement(attNameHash);
            }
            else
            {
                curAtt = fAttrList->elementAt(attCount);
                curAtt->set
                (
                    0
                    , attNameRawBuf
                    , XMLUni::fgZeroLenString
                    , fAttValueBuf.getRawBuffer()
                );
                curAtt->setSpecified(true);
                fAttrNameHashList->setElementAt(attNameHash, attCount);
            }
            attCount++;

            // And jump back to the top of the loop
            continue;
        }

        // It was some special case character so do all of the checks and
        // deal with it.
        if (!nextCh)
            ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);

        if (nextCh == chForwardSlash)
        {
            fReaderMgr.getNextChar();
            isEmpty = true;
            if (!fReaderMgr.skippedChar(chCloseAngle))
                emitError(XMLErrs::UnterminatedStartTag, qnameRawBuf);
            break;
        }
        else if (nextCh == chCloseAngle)
        {
            fReaderMgr.getNextChar();
            break;
        }
        else if (nextCh == chOpenAngle)
        {
            // Check for this one specially, since its going to be common
            // and it is kind of auto-recovering since we've already hit the
            // next open bracket, which is what we would have seeked to (and
            // skipped this whole tag.)
            emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName());
            break;
        }
        else if ((nextCh == chSingleQuote) || (nextCh == chDoubleQuote))
        {
            // Check for this one specially, which is probably a missing
            // attribute name, e.g. ="value". Just issue expected name
            // error and eat the quoted string, then jump back to the
            // top again.
            emitError(XMLErrs::ExpectedAttrName);
            fReaderMgr.getNextChar();
            fReaderMgr.skipQuotedString(nextCh);
            fReaderMgr.skipPastSpaces();
            continue;
        }
    }

    // If the tag was empty (<name/>), pop the element stack top right away
    // since no end tag will follow; an empty root also ends the content.
    if (isEmpty)
    {
        // Pop the element stack back off since it'll never be used now
        fElemStack.popTop();

        // If the elem stack is empty, then it was an empty root
        if (isRoot)
            gotData = false;
    }

    // If we have a document handler, then tell it about this start tag. We
    // don't have any URI id to send along, so send fEmptyNamespaceId. We also do not send
    // any prefix since its just one big name if we are not doing namespaces.
    if (fDocHandler)
    {
        fDocHandler->startElement
        (
            *elemDecl
            , fEmptyNamespaceId
            , 0
            , *fAttrList
            , attCount
            , isEmpty
            , isRoot
        );
    }

    return true;
}


// This method is called to scan a start tag when we are processing
// namespaces. There are two different versions of this method, one for
// namespace aware processing and one for non-namespace aware processing.
//
// This method is called after we've scanned the < of a start tag. So we
// have to get the element name, then scan the attributes, after which
// we are either going to see >, />, or attributes followed by one of those
// sequences.
bool WFXMLScanner::scanStartTagNS(bool& gotData)
{
    // Assume we will still have data until proven otherwise. It will only
    // ever be false if this is the root and its empty.
    gotData = true;

    // The current position is after the open bracket, so we need to read
    // in the element name.
01072 int colonPosition; 01073 if (!fReaderMgr.getQName(fQNameBuf, &colonPosition)) 01074 { 01075 if (fQNameBuf.isEmpty()) 01076 emitError(XMLErrs::ExpectedElementName); 01077 else 01078 emitError(XMLErrs::InvalidElementName, fQNameBuf.getRawBuffer()); 01079 fReaderMgr.skipToChar(chOpenAngle); 01080 return false; 01081 } 01082 01083 // See if its the root element 01084 const bool isRoot = fElemStack.isEmpty(); 01085 01086 // Assume it won't be an empty tag 01087 bool isEmpty = false; 01088 01089 // Skip any whitespace after the name 01090 fReaderMgr.skipPastSpaces(); 01091 01092 // Lets try to look up the element 01093 const XMLCh* qnameRawBuf = fQNameBuf.getRawBuffer(); 01094 XMLElementDecl* elemDecl = fElementLookup->get(qnameRawBuf); 01095 01096 if (!elemDecl) { 01097 if (!XMLString::compareNString(qnameRawBuf, XMLUni::fgXMLNSColonString, 6)) 01098 emitError(XMLErrs::NoXMLNSAsElementPrefix, qnameRawBuf); 01099 01100 if (fElementIndex < fElements->size()) { 01101 elemDecl = fElements->elementAt(fElementIndex); 01102 } 01103 else { 01104 elemDecl = new (fGrammarPoolMemoryManager) DTDElementDecl 01105 ( 01106 fGrammarPoolMemoryManager 01107 ); 01108 fElements->addElement(elemDecl); 01109 } 01110 01111 elemDecl->setElementName(qnameRawBuf, fEmptyNamespaceId); 01112 fElementLookup->put((void*)elemDecl->getFullName(), elemDecl); 01113 fElementIndex++; 01114 } 01115 01116 // Expand the element stack and add the new element 01117 fElemStack.addLevel(elemDecl, fReaderMgr.getCurrentReaderNum()); 01118 01119 // reset NS attribute list 01120 fAttrNSList->removeAllElements(); 01121 01122 // We loop until we either see a /> or >, handling attribute/value 01123 // pairs until we get there. 
01124 XMLSize_t attCount = 0; 01125 XMLSize_t curAttListSize = fAttrList->size(); 01126 while (true) 01127 { 01128 // And get the next non-space character 01129 XMLCh nextCh = fReaderMgr.peekNextChar(); 01130 01131 // If the next character is not a slash or closed angle bracket, 01132 // then it must be whitespace, since whitespace is required 01133 // between the end of the last attribute and the name of the next 01134 // one. 01135 if (attCount) 01136 { 01137 if ((nextCh != chForwardSlash) && (nextCh != chCloseAngle)) 01138 { 01139 bool bFoundSpace; 01140 fReaderMgr.skipPastSpaces(bFoundSpace); 01141 if (!bFoundSpace) 01142 { 01143 // Emit the error but keep on going 01144 emitError(XMLErrs::ExpectedWhitespace); 01145 } 01146 // Ok, peek another char 01147 nextCh = fReaderMgr.peekNextChar(); 01148 } 01149 } 01150 01151 // Ok, here we first check for any of the special case characters. 01152 // If its not one, then we do the normal case processing, which 01153 // assumes that we've hit an attribute value, Otherwise, we do all 01154 // the special case checks. 01155 if (!fReaderMgr.getCurrentReader()->isSpecialStartTagChar(nextCh)) 01156 { 01157 // Assume its going to be an attribute, so get a name from 01158 // the input. 01159 int colonPosition; 01160 if (!fReaderMgr.getQName(fAttNameBuf, &colonPosition)) 01161 { 01162 if (fAttNameBuf.isEmpty()) 01163 emitError(XMLErrs::ExpectedAttrName); 01164 else 01165 emitError(XMLErrs::InvalidAttrName, fAttNameBuf.getRawBuffer()); 01166 fReaderMgr.skipPastChar(chCloseAngle); 01167 return false; 01168 } 01169 01170 // And next must be an equal sign 01171 if (!scanEq()) 01172 { 01173 static const XMLCh tmpList[] = 01174 { 01175 chSingleQuote, chDoubleQuote, chCloseAngle 01176 , chOpenAngle, chForwardSlash, chNull 01177 }; 01178 01179 emitError(XMLErrs::ExpectedEqSign); 01180 01181 // Try to sync back up by skipping forward until we either 01182 // hit something meaningful. 
01183 const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList); 01184 01185 if ((chFound == chCloseAngle) || (chFound == chForwardSlash)) 01186 { 01187 // Jump back to top for normal processing of these 01188 continue; 01189 } 01190 else if ((chFound == chSingleQuote) 01191 || (chFound == chDoubleQuote) 01192 || fReaderMgr.getCurrentReader()->isWhitespace(chFound)) 01193 { 01194 // Just fall through assuming that the value is to follow 01195 } 01196 else if (chFound == chOpenAngle) 01197 { 01198 // Assume a malformed tag and that new one is starting 01199 emitError(XMLErrs::UnterminatedStartTag, qnameRawBuf); 01200 return false; 01201 } 01202 else 01203 { 01204 // Something went really wrong 01205 return false; 01206 } 01207 } 01208 01209 // See if this attribute is declared more than one for this element. 01210 const XMLCh* attNameRawBuf = fAttNameBuf.getRawBuffer(); 01211 XMLSize_t attNameHash = XMLString::hash(attNameRawBuf, 109); 01212 if (attCount) { 01213 01214 for (XMLSize_t k=0; k < attCount; k++) { 01215 01216 if (fAttrNameHashList->elementAt(k) == attNameHash) { 01217 if (XMLString::equals( 01218 fAttrList->elementAt(k)->getQName() 01219 , attNameRawBuf)) 01220 { 01221 emitError 01222 ( 01223 XMLErrs::AttrAlreadyUsedInSTag 01224 , attNameRawBuf 01225 , qnameRawBuf 01226 ); 01227 break; 01228 } 01229 } 01230 } 01231 } 01232 01233 // Skip any whitespace before the value and then scan the att 01234 // value. This will come back normalized with entity refs and 01235 // char refs expanded. 01236 fReaderMgr.skipPastSpaces(); 01237 if (!scanAttValue(attNameRawBuf, fAttValueBuf)) 01238 { 01239 static const XMLCh tmpList[] = 01240 { 01241 chCloseAngle, chOpenAngle, chForwardSlash, chNull 01242 }; 01243 01244 emitError(XMLErrs::ExpectedAttrValue); 01245 01246 // It failed, so lets try to get synced back up. We skip 01247 // forward until we find some whitespace or one of the 01248 // chars in our list. 
01249 const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList); 01250 01251 if ((chFound == chCloseAngle) 01252 || (chFound == chForwardSlash) 01253 || fReaderMgr.getCurrentReader()->isWhitespace(chFound)) 01254 { 01255 // Just fall through and process this attribute, though 01256 // the value will be "". 01257 } 01258 else if (chFound == chOpenAngle) 01259 { 01260 // Assume a malformed tag and that new one is starting 01261 emitError(XMLErrs::UnterminatedStartTag, qnameRawBuf); 01262 return false; 01263 } 01264 else 01265 { 01266 // Something went really wrong 01267 return false; 01268 } 01269 } 01270 01271 // Add this attribute to the attribute list that we use to 01272 // pass them to the handler. We reuse its existing elements 01273 // but expand it as required. 01274 const XMLCh* attValueRawBuf = fAttValueBuf.getRawBuffer(); 01275 XMLAttr* curAtt = 0; 01276 if (attCount >= curAttListSize) 01277 { 01278 curAtt = new (fMemoryManager) XMLAttr 01279 ( 01280 fEmptyNamespaceId 01281 , attNameRawBuf 01282 , attValueRawBuf 01283 , XMLAttDef::CData 01284 , true 01285 , fMemoryManager 01286 ); 01287 fAttrList->addElement(curAtt); 01288 fAttrNameHashList->addElement(attNameHash); 01289 } 01290 else 01291 { 01292 curAtt = fAttrList->elementAt(attCount); 01293 curAtt->set 01294 ( 01295 fEmptyNamespaceId 01296 , attNameRawBuf 01297 , attValueRawBuf 01298 ); 01299 curAtt->setSpecified(true); 01300 fAttrNameHashList->setElementAt(attNameHash, attCount); 01301 } 01302 01303 // Map prefix to namespace 01304 const XMLCh* attPrefix = curAtt->getPrefix(); 01305 const XMLCh* attLocalName = curAtt->getName(); 01306 const XMLCh* namespaceURI = fAttValueBuf.getRawBuffer(); 01307 01308 if (attPrefix && *attPrefix) { 01309 if (XMLString::equals(attPrefix, XMLUni::fgXMLString)) { 01310 curAtt->setURIId(fXMLNamespaceId); 01311 } 01312 else if (XMLString::equals(attPrefix, XMLUni::fgXMLNSString)) { 01313 01314 if (XMLString::equals(attLocalName, XMLUni::fgXMLNSString)) 01315 
emitError(XMLErrs::NoUseOfxmlnsAsPrefix); 01316 else if (XMLString::equals(attLocalName, XMLUni::fgXMLString)) { 01317 if (!XMLString::equals(namespaceURI, XMLUni::fgXMLURIName)) 01318 emitError(XMLErrs::PrefixXMLNotMatchXMLURI); 01319 } 01320 01321 if (!namespaceURI) 01322 emitError(XMLErrs::NoEmptyStrNamespace, attNameRawBuf); 01323 else if(!*namespaceURI && fXMLVersion == XMLReader::XMLV1_0) 01324 emitError(XMLErrs::NoEmptyStrNamespace, attNameRawBuf); 01325 01326 fElemStack.addPrefix 01327 ( 01328 attLocalName 01329 , fURIStringPool->addOrFind(namespaceURI) 01330 ); 01331 curAtt->setURIId(fXMLNSNamespaceId); 01332 } 01333 else { 01334 fAttrNSList->addElement(curAtt); 01335 } 01336 } 01337 else { 01338 if (XMLString::equals(XMLUni::fgXMLNSString, attLocalName)) { 01339 01340 if (XMLString::equals(namespaceURI, XMLUni::fgXMLNSURIName)) 01341 emitError(XMLErrs::NoUseOfxmlnsURI); 01342 else if (XMLString::equals(namespaceURI, XMLUni::fgXMLURIName)) 01343 emitError(XMLErrs::XMLURINotMatchXMLPrefix); 01344 01345 fElemStack.addPrefix 01346 ( 01347 XMLUni::fgZeroLenString 01348 , fURIStringPool->addOrFind(namespaceURI) 01349 ); 01350 } 01351 } 01352 01353 // increment attribute count 01354 attCount++; 01355 01356 // And jump back to the top of the loop 01357 continue; 01358 } 01359 01360 // It was some special case character so do all of the checks and 01361 // deal with it. 
01362 if (!nextCh) 01363 ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager); 01364 01365 if (nextCh == chForwardSlash) 01366 { 01367 fReaderMgr.getNextChar(); 01368 isEmpty = true; 01369 if (!fReaderMgr.skippedChar(chCloseAngle)) 01370 emitError(XMLErrs::UnterminatedStartTag, qnameRawBuf); 01371 break; 01372 } 01373 else if (nextCh == chCloseAngle) 01374 { 01375 fReaderMgr.getNextChar(); 01376 break; 01377 } 01378 else if (nextCh == chOpenAngle) 01379 { 01380 // Check for this one specially, since its going to be common 01381 // and it is kind of auto-recovering since we've already hit the 01382 // next open bracket, which is what we would have seeked to (and 01383 // skipped this whole tag.) 01384 emitError(XMLErrs::UnterminatedStartTag, qnameRawBuf); 01385 break; 01386 } 01387 else if ((nextCh == chSingleQuote) || (nextCh == chDoubleQuote)) 01388 { 01389 // Check for this one specially, which is probably a missing 01390 // attribute name, e.g. ="value". Just issue expected name 01391 // error and eat the quoted string, then jump back to the 01392 // top again. 
01393 emitError(XMLErrs::ExpectedAttrName); 01394 fReaderMgr.getNextChar(); 01395 fReaderMgr.skipQuotedString(nextCh); 01396 fReaderMgr.skipPastSpaces(); 01397 continue; 01398 } 01399 } 01400 01401 // Handle provided attributes that we did not map their prefixes 01402 for (unsigned int i=0; i < fAttrNSList->size(); i++) { 01403 01404 XMLAttr* providedAttr = fAttrNSList->elementAt(i); 01405 01406 providedAttr->setURIId 01407 ( 01408 resolvePrefix 01409 ( 01410 providedAttr->getPrefix(), 01411 ElemStack::Mode_Attribute 01412 ) 01413 ); 01414 } 01415 01416 if(attCount) { 01417 01418 // 01419 // Decide if to use hash table to do duplicate checking 01420 // 01421 bool toUseHashTable = false; 01422 setAttrDupChkRegistry(attCount, toUseHashTable); 01423 01424 // check for duplicate namespace attributes: 01425 // by checking for qualified names with the same local part and with prefixes 01426 // which have been bound to namespace names that are identical. 01427 XMLAttr* loopAttr; 01428 XMLAttr* curAtt; 01429 for (unsigned int attrIndex=0; attrIndex < attCount-1; attrIndex++) { 01430 loopAttr = fAttrList->elementAt(attrIndex); 01431 01432 if (!toUseHashTable) 01433 { 01434 for (unsigned int curAttrIndex = attrIndex+1; curAttrIndex < attCount; curAttrIndex++) { 01435 curAtt = fAttrList->elementAt(curAttrIndex); 01436 if (curAtt->getURIId() == loopAttr->getURIId() && 01437 XMLString::equals(curAtt->getName(), loopAttr->getName())) { 01438 emitError 01439 ( 01440 XMLErrs::AttrAlreadyUsedInSTag 01441 , curAtt->getName() 01442 , elemDecl->getFullName() 01443 ); 01444 } 01445 } 01446 } 01447 else 01448 { 01449 if (fAttrDupChkRegistry->containsKey((void*)loopAttr->getName(), loopAttr->getURIId())) 01450 { 01451 emitError 01452 ( 01453 XMLErrs::AttrAlreadyUsedInSTag 01454 , loopAttr->getName() 01455 , elemDecl->getFullName() 01456 ); 01457 } 01458 01459 fAttrDupChkRegistry->put((void*)loopAttr->getName(), loopAttr->getURIId(), loopAttr); 01460 } 01461 } 01462 } 01463 01464 // 
Resolve the qualified name to a URI. 01465 unsigned int uriId = resolvePrefix 01466 ( 01467 elemDecl->getElementName()->getPrefix() 01468 , ElemStack::Mode_Element 01469 ); 01470 01471 // Now we can update the element stack 01472 fElemStack.setCurrentURI(uriId); 01473 01474 // Tell the document handler about this start tag 01475 if (fDocHandler) 01476 { 01477 fDocHandler->startElement 01478 ( 01479 *elemDecl 01480 , uriId 01481 , elemDecl->getElementName()->getPrefix() 01482 , *fAttrList 01483 , attCount 01484 , isEmpty 01485 , isRoot 01486 ); 01487 } 01488 01489 // If empty, validate content right now if we are validating and then 01490 // pop the element stack top. Else, we have to update the current stack 01491 // top's namespace mapping elements. 01492 if (isEmpty) 01493 { 01494 // Pop the element stack back off since it'll never be used now 01495 fElemStack.popTop(); 01496 01497 // If the elem stack is empty, then it was an empty root 01498 if (isRoot) 01499 gotData = false; 01500 } 01501 01502 return true; 01503 } 01504 01505 // --------------------------------------------------------------------------- 01506 // XMLScanner: Private parsing methods 01507 // --------------------------------------------------------------------------- 01508 bool WFXMLScanner::scanAttValue(const XMLCh* const attrName 01509 , XMLBuffer& toFill) 01510 { 01511 // Reset the target buffer 01512 toFill.reset(); 01513 01514 // Get the next char which must be a single or double quote 01515 XMLCh quoteCh; 01516 if (!fReaderMgr.skipIfQuote(quoteCh)) 01517 return false; 01518 01519 // We have to get the current reader because we have to ignore closing 01520 // quotes until we hit the same reader again. 01521 const XMLSize_t curReader = fReaderMgr.getCurrentReaderNum(); 01522 01523 // Loop until we get the attribute value. Note that we use a double 01524 // loop here to avoid the setup/teardown overhead of the exception 01525 // handler on every round. 
01526 XMLCh nextCh; 01527 XMLCh secondCh = 0; 01528 bool gotLeadingSurrogate = false; 01529 bool escaped; 01530 while (true) 01531 { 01532 try 01533 { 01534 while(true) 01535 { 01536 nextCh = fReaderMgr.getNextChar(); 01537 01538 if (!nextCh) 01539 ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager); 01540 01541 // Check for our ending quote in the same entity 01542 if (nextCh == quoteCh) 01543 { 01544 if (curReader == fReaderMgr.getCurrentReaderNum()) 01545 return true; 01546 01547 // Watch for spillover into a previous entity 01548 if (curReader > fReaderMgr.getCurrentReaderNum()) 01549 { 01550 emitError(XMLErrs::PartialMarkupInEntity); 01551 return false; 01552 } 01553 } 01554 01555 // Check for an entity ref now, before we let it affect our 01556 // whitespace normalization logic below. We ignore the empty flag 01557 // in this one. 01558 escaped = false; 01559 if (nextCh == chAmpersand) 01560 { 01561 if (scanEntityRef(true, nextCh, secondCh, escaped) != EntityExp_Returned) 01562 { 01563 gotLeadingSurrogate = false; 01564 continue; 01565 } 01566 } 01567 else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF)) 01568 { 01569 // Deal with surrogate pairs 01570 // Its a leading surrogate. If we already got one, then 01571 // issue an error, else set leading flag to make sure that 01572 // we look for a trailing next time. 01573 if (gotLeadingSurrogate) 01574 { 01575 emitError(XMLErrs::Expected2ndSurrogateChar); 01576 } 01577 else 01578 gotLeadingSurrogate = true; 01579 } 01580 else 01581 { 01582 // If its a trailing surrogate, make sure that we are 01583 // prepared for that. Else, its just a regular char so make 01584 // sure that we were not expected a trailing surrogate. 
01585 if ((nextCh >= 0xDC00) && (nextCh <= 0xDFFF)) 01586 { 01587 // Its trailing, so make sure we were expecting it 01588 if (!gotLeadingSurrogate) 01589 emitError(XMLErrs::Unexpected2ndSurrogateChar); 01590 } 01591 else 01592 { 01593 // Its just a char, so make sure we were not expecting a 01594 // trailing surrogate. 01595 if (gotLeadingSurrogate) { 01596 emitError(XMLErrs::Expected2ndSurrogateChar); 01597 } 01598 // Its got to at least be a valid XML character 01599 else if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh)) 01600 { 01601 XMLCh tmpBuf[9]; 01602 XMLString::binToText 01603 ( 01604 nextCh 01605 , tmpBuf 01606 , 8 01607 , 16 01608 , fMemoryManager 01609 ); 01610 emitError(XMLErrs::InvalidCharacterInAttrValue, attrName, tmpBuf); 01611 } 01612 } 01613 gotLeadingSurrogate = false; 01614 } 01615 01616 // If its not escaped, then make sure its not a < character, which 01617 // is not allowed in attribute values. 01618 if (!escaped) { 01619 if (nextCh == chOpenAngle) 01620 emitError(XMLErrs::BracketInAttrValue, attrName); 01621 else if (fReaderMgr.getCurrentReader()->isWhitespace(nextCh)) 01622 nextCh = chSpace; 01623 } 01624 01625 // Else add it to the buffer 01626 toFill.append(nextCh); 01627 01628 if (secondCh) 01629 { 01630 toFill.append(secondCh); 01631 secondCh=0; 01632 } 01633 } 01634 } 01635 catch(const EndOfEntityException&) 01636 { 01637 // Just eat it and continue. 01638 gotLeadingSurrogate = false; 01639 escaped = false; 01640 } 01641 } 01642 return true; 01643 } 01644 01645 01646 // This method scans a CDATA section. It collects the character into one 01647 // of the temp buffers and calls the document handler, if any, with the 01648 // characters. It assumes that the <![CDATA string has been scanned before 01649 // this call. 
01650 void WFXMLScanner::scanCDSection() 01651 { 01652 static const XMLCh CDataClose[] = 01653 { 01654 chCloseSquare, chCloseAngle, chNull 01655 }; 01656 01657 // The next character should be the opening square bracket. If not 01658 // issue an error, but then try to recover by skipping any whitespace 01659 // and checking again. 01660 if (!fReaderMgr.skippedChar(chOpenSquare)) 01661 { 01662 emitError(XMLErrs::ExpectedOpenSquareBracket); 01663 fReaderMgr.skipPastSpaces(); 01664 01665 // If we still don't find it, then give up, else keep going 01666 if (!fReaderMgr.skippedChar(chOpenSquare)) 01667 return; 01668 } 01669 01670 // Get a buffer for this 01671 XMLBufBid bbCData(&fBufMgr); 01672 01673 // We just scan forward until we hit the end of CDATA section sequence. 01674 // CDATA is effectively a big escape mechanism so we don't treat markup 01675 // characters specially here. 01676 bool emittedError = false; 01677 bool gotLeadingSurrogate = false; 01678 while (true) 01679 { 01680 const XMLCh nextCh = fReaderMgr.getNextChar(); 01681 01682 // Watch for unexpected end of file 01683 if (!nextCh) 01684 { 01685 emitError(XMLErrs::UnterminatedCDATASection); 01686 ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager); 01687 } 01688 01689 // If this is a close square bracket it could be our closing 01690 // sequence. 01691 if (nextCh == chCloseSquare && fReaderMgr.skippedString(CDataClose)) 01692 { 01693 // make sure we were not expecting a trailing surrogate. 01694 if (gotLeadingSurrogate) 01695 emitError(XMLErrs::Expected2ndSurrogateChar); 01696 01697 // If we have a doc handler, call it 01698 if (fDocHandler) 01699 { 01700 fDocHandler->docCharacters 01701 ( 01702 bbCData.getRawBuffer() 01703 , bbCData.getLen() 01704 , true 01705 ); 01706 } 01707 01708 // And we are done 01709 break; 01710 } 01711 01712 // Make sure its a valid character. 
But if we've emitted an error 01713 // already, don't bother with the overhead since we've already told 01714 // them about it. 01715 if (!emittedError) 01716 { 01717 // Deal with surrogate pairs 01718 if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF)) 01719 { 01720 // Its a leading surrogate. If we already got one, then 01721 // issue an error, else set leading flag to make sure that 01722 // we look for a trailing next time. 01723 if (gotLeadingSurrogate) 01724 emitError(XMLErrs::Expected2ndSurrogateChar); 01725 else 01726 gotLeadingSurrogate = true; 01727 } 01728 else 01729 { 01730 // If its a trailing surrogate, make sure that we are 01731 // prepared for that. Else, its just a regular char so make 01732 // sure that we were not expected a trailing surrogate. 01733 if ((nextCh >= 0xDC00) && (nextCh <= 0xDFFF)) 01734 { 01735 // Its trailing, so make sure we were expecting it 01736 if (!gotLeadingSurrogate) 01737 emitError(XMLErrs::Unexpected2ndSurrogateChar); 01738 } 01739 else 01740 { 01741 // Its just a char, so make sure we were not expecting a 01742 // trailing surrogate. 01743 if (gotLeadingSurrogate) 01744 emitError(XMLErrs::Expected2ndSurrogateChar); 01745 01746 // Its got to at least be a valid XML character 01747 else if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh)) 01748 { 01749 XMLCh tmpBuf[9]; 01750 XMLString::binToText 01751 ( 01752 nextCh 01753 , tmpBuf 01754 , 8 01755 , 16 01756 , fMemoryManager 01757 ); 01758 emitError(XMLErrs::InvalidCharacter, tmpBuf); 01759 emittedError = true; 01760 } 01761 } 01762 gotLeadingSurrogate = false; 01763 } 01764 } 01765 01766 // Add it to the buffer 01767 bbCData.append(nextCh); 01768 } 01769 } 01770 01771 01772 void WFXMLScanner::scanCharData(XMLBuffer& toUse) 01773 { 01774 // We have to watch for the stupid ]]> sequence, which is illegal in 01775 // character data. So this is a little state machine that handles that. 
01776 enum States 01777 { 01778 State_Waiting 01779 , State_GotOne 01780 , State_GotTwo 01781 }; 01782 01783 // Reset the buffer before we start 01784 toUse.reset(); 01785 01786 // Turn on the 'throw at end' flag of the reader manager 01787 ThrowEOEJanitor jan(&fReaderMgr, true); 01788 01789 // In order to be more efficient we have to use kind of a deeply nested 01790 // set of blocks here. The outer block puts on a try and catches end of 01791 // entity exceptions. The inner loop is the per-character loop. If we 01792 // put the try inside the inner loop, it would work but would require 01793 // the exception handling code setup/teardown code to be invoked for 01794 // each character. 01795 XMLCh nextCh; 01796 XMLCh secondCh = 0; 01797 States curState = State_Waiting; 01798 bool escaped = false; 01799 bool gotLeadingSurrogate = false; 01800 bool notDone = true; 01801 while (notDone) 01802 { 01803 try 01804 { 01805 while (true) 01806 { 01807 // Eat through as many plain content characters as possible without 01808 // needing special handling. Moving most content characters here, 01809 // in this one call, rather than running the overall loop once 01810 // per content character, is a speed optimization. 01811 if (curState == State_Waiting && !gotLeadingSurrogate) 01812 { 01813 fReaderMgr.movePlainContentChars(toUse); 01814 } 01815 01816 // Try to get another char from the source 01817 // The code from here on down covers all contengencies, 01818 if (!fReaderMgr.getNextCharIfNot(chOpenAngle, nextCh)) 01819 { 01820 // If we were waiting for a trailing surrogate, its an error 01821 if (gotLeadingSurrogate) 01822 emitError(XMLErrs::Expected2ndSurrogateChar); 01823 01824 notDone = false; 01825 break; 01826 } 01827 01828 // Watch for a reference. Note that the escapement mechanism 01829 // is ignored in this content. 
01830 escaped = false; 01831 if (nextCh == chAmpersand) 01832 { 01833 sendCharData(toUse); 01834 01835 // Turn off the throwing at the end of entity during this 01836 ThrowEOEJanitor jan(&fReaderMgr, false); 01837 01838 if (scanEntityRef(false, nextCh, secondCh, escaped) != EntityExp_Returned) 01839 { 01840 gotLeadingSurrogate = false; 01841 continue; 01842 } 01843 } 01844 else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF)) 01845 { 01846 // Deal with surrogate pairs 01847 // Its a leading surrogate. If we already got one, then 01848 // issue an error, else set leading flag to make sure that 01849 // we look for a trailing next time. 01850 if (gotLeadingSurrogate) 01851 { 01852 emitError(XMLErrs::Expected2ndSurrogateChar); 01853 } 01854 else 01855 gotLeadingSurrogate = true; 01856 } 01857 else 01858 { 01859 // If its a trailing surrogate, make sure that we are 01860 // prepared for that. Else, its just a regular char so make 01861 // sure that we were not expected a trailing surrogate. 01862 if ((nextCh >= 0xDC00) && (nextCh <= 0xDFFF)) 01863 { 01864 // Its trailing, so make sure we were expecting it 01865 if (!gotLeadingSurrogate) 01866 emitError(XMLErrs::Unexpected2ndSurrogateChar); 01867 } 01868 else 01869 { 01870 // Its just a char, so make sure we were not expecting a 01871 // trailing surrogate. 
01872 if (gotLeadingSurrogate) { 01873 emitError(XMLErrs::Expected2ndSurrogateChar); 01874 } 01875 // Its got to at least be a valid XML character 01876 else if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh)) 01877 { 01878 XMLCh tmpBuf[9]; 01879 XMLString::binToText 01880 ( 01881 nextCh 01882 , tmpBuf 01883 , 8 01884 , 16 01885 , fMemoryManager 01886 ); 01887 emitError(XMLErrs::InvalidCharacter, tmpBuf); 01888 } 01889 } 01890 gotLeadingSurrogate = false; 01891 } 01892 01893 // Keep the state machine up to date 01894 if (!escaped) 01895 { 01896 if (nextCh == chCloseSquare) 01897 { 01898 if (curState == State_Waiting) 01899 curState = State_GotOne; 01900 else if (curState == State_GotOne) 01901 curState = State_GotTwo; 01902 } 01903 else if (nextCh == chCloseAngle) 01904 { 01905 if (curState == State_GotTwo) 01906 emitError(XMLErrs::BadSequenceInCharData); 01907 curState = State_Waiting; 01908 } 01909 else 01910 { 01911 curState = State_Waiting; 01912 } 01913 } 01914 else 01915 { 01916 curState = State_Waiting; 01917 } 01918 01919 // Add this char to the buffer 01920 toUse.append(nextCh); 01921 01922 if (secondCh) 01923 { 01924 toUse.append(secondCh); 01925 secondCh=0; 01926 } 01927 } 01928 } 01929 catch(const EndOfEntityException& toCatch) 01930 { 01931 // Some entity ended, so we have to send any accumulated 01932 // chars and send an end of entity event. 01933 sendCharData(toUse); 01934 gotLeadingSurrogate = false; 01935 01936 if (fDocHandler) 01937 fDocHandler->endEntityReference(toCatch.getEntity()); 01938 } 01939 } 01940 01941 // Send any char data that we accumulated into the buffer 01942 sendCharData(toUse); 01943 } 01944 01945 InputSource* WFXMLScanner::resolveSystemId(const XMLCh* const /*sysId*/ 01946 ,const XMLCh* const /*pubId*/) 01947 { 01948 return 0; 01949 } 01950 01951 // This method will scan a general/character entity ref. It will either 01952 // expand a char ref and return it directly, or push a reader for a general 01953 // entity. 
01954 // 01955 // The return value indicates whether the char parameters hold the value 01956 // or whether the value was pushed as a reader, or that it failed. 01957 // 01958 // The escaped flag tells the caller whether the returned parameter resulted 01959 // from a character reference, which escapes the character in some cases. It 01960 // only makes any difference if the return value indicates the value was 01961 // returned directly. 01962 XMLScanner::EntityExpRes 01963 WFXMLScanner::scanEntityRef(const bool 01964 , XMLCh& firstCh 01965 , XMLCh& secondCh 01966 , bool& escaped) 01967 { 01968 // Assume no escape 01969 secondCh = 0; 01970 escaped = false; 01971 01972 // We have to insure that its all in one entity 01973 const XMLSize_t curReader = fReaderMgr.getCurrentReaderNum(); 01974 01975 // If the next char is a pound, then its a character reference and we 01976 // need to expand it always. 01977 if (fReaderMgr.skippedChar(chPound)) 01978 { 01979 // Its a character reference, so scan it and get back the numeric 01980 // value it represents. 01981 if (!scanCharRef(firstCh, secondCh)) 01982 return EntityExp_Failed; 01983 01984 escaped = true; 01985 01986 if (curReader != fReaderMgr.getCurrentReaderNum()) 01987 emitError(XMLErrs::PartialMarkupInEntity); 01988 01989 return EntityExp_Returned; 01990 } 01991 01992 // Expand it since its a normal entity ref 01993 XMLBufBid bbName(&fBufMgr); 01994 if (!fReaderMgr.getName(bbName.getBuffer())) 01995 { 01996 emitError(XMLErrs::ExpectedEntityRefName); 01997 return EntityExp_Failed; 01998 } 01999 02000 // Next char must be a semi-colon. But if its not, just emit 02001 // an error and try to continue. 
02002 if (!fReaderMgr.skippedChar(chSemiColon)) 02003 emitError(XMLErrs::UnterminatedEntityRef, bbName.getRawBuffer()); 02004 02005 // Make sure we ended up on the same entity reader as the & char 02006 if (curReader != fReaderMgr.getCurrentReaderNum()) 02007 emitError(XMLErrs::PartialMarkupInEntity); 02008 02009 // Look up the name in the general entity pool 02010 // If it does not exist, then obviously an error 02011 if (!fEntityTable->containsKey(bbName.getRawBuffer())) 02012 { 02013 // XML 1.0 Section 4.1 02014 // Well-formedness Constraint for entity not found: 02015 // In a document without any DTD, a document with only an internal DTD subset which contains no parameter entity references, 02016 // or a document with "standalone='yes'", for an entity reference that does not occur within the external subset 02017 // or a parameter entity 02018 if (fStandalone || fHasNoDTD) 02019 emitError(XMLErrs::EntityNotFound, bbName.getRawBuffer()); 02020 02021 return EntityExp_Failed; 02022 } 02023 02024 // here's where we need to check if there's a SecurityManager, 02025 // how many entity references we've had 02026 if(fSecurityManager != 0 && ++fEntityExpansionCount > fEntityExpansionLimit) { 02027 XMLCh expLimStr[32]; 02028 XMLString::sizeToText(fEntityExpansionLimit, expLimStr, 31, 10, fMemoryManager); 02029 emitError 02030 ( 02031 XMLErrs::EntityExpansionLimitExceeded 02032 , expLimStr 02033 ); 02034 // there seems nothing better to be done than to reset the entity expansion counter 02035 fEntityExpansionCount = 0; 02036 } 02037 02038 firstCh = fEntityTable->get(bbName.getRawBuffer()); 02039 escaped = true; 02040 return EntityExp_Returned; 02041 } 02042 02043 // --------------------------------------------------------------------------- 02044 // WFXMLScanner: Grammar preparsing 02045 // --------------------------------------------------------------------------- 02046 Grammar* WFXMLScanner::loadGrammar(const InputSource& 02047 , const short 02048 , const bool) 02049 { 
02050 // REVISIT: emit a warning or throw an exception 02051 return 0; 02052 } 02053 02054 02055 XERCES_CPP_NAMESPACE_END