GME
13
|
00001 /* 00002 * Licensed to the Apache Software Foundation (ASF) under one or more 00003 * contributor license agreements. See the NOTICE file distributed with 00004 * this work for additional information regarding copyright ownership. 00005 * The ASF licenses this file to You under the Apache License, Version 2.0 00006 * (the "License"); you may not use this file except in compliance with 00007 * the License. You may obtain a copy of the License at 00008 * 00009 * http://www.apache.org/licenses/LICENSE-2.0 00010 * 00011 * Unless required by applicable law or agreed to in writing, software 00012 * distributed under the License is distributed on an "AS IS" BASIS, 00013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00014 * See the License for the specific language governing permissions and 00015 * limitations under the License. 00016 */ 00017 00018 /* 00019 * $Id: DTDScanner.cpp 833045 2009-11-05 13:21:27Z borisk $ 00020 */ 00021 00022 00023 // --------------------------------------------------------------------------- 00024 // Includes 00025 // --------------------------------------------------------------------------- 00026 #include <xercesc/util/BinMemInputStream.hpp> 00027 #include <xercesc/util/FlagJanitor.hpp> 00028 #include <xercesc/util/Janitor.hpp> 00029 #include <xercesc/util/XMLUniDefs.hpp> 00030 #include <xercesc/util/ValueStackOf.hpp> 00031 #include <xercesc/util/UnexpectedEOFException.hpp> 00032 #include <xercesc/util/OutOfMemoryException.hpp> 00033 #include <xercesc/sax/InputSource.hpp> 00034 #include <xercesc/framework/XMLDocumentHandler.hpp> 00035 #include <xercesc/framework/XMLEntityHandler.hpp> 00036 #include <xercesc/framework/XMLValidator.hpp> 00037 #include <xercesc/internal/EndOfEntityException.hpp> 00038 #include <xercesc/internal/XMLScanner.hpp> 00039 #include <xercesc/validators/common/ContentSpecNode.hpp> 00040 #include <xercesc/validators/common/MixedContentModel.hpp> 00041 #include <xercesc/validators/DTD/DTDEntityDecl.hpp> 00042 #include <xercesc/validators/DTD/DocTypeHandler.hpp> 00043 #include <xercesc/validators/DTD/DTDScanner.hpp> 00044 00045 XERCES_CPP_NAMESPACE_BEGIN 00046 00047 // --------------------------------------------------------------------------- 00048 // Local methods 00049 // --------------------------------------------------------------------------- 00050 // 00051 // This method automates the grunt work of looking at a char and see if its 00052 // a repetition suffix. If so, it creates a new correct rep node and wraps 00053 // the pass node in it. Otherwise, it returns the previous node. 00054 // 00055 static ContentSpecNode* makeRepNode(const XMLCh testCh, 00056 ContentSpecNode* const prevNode, 00057 MemoryManager* const manager) 00058 { 00059 if (testCh == chQuestion) 00060 { 00061 return new (manager) ContentSpecNode 00062 ( 00063 ContentSpecNode::ZeroOrOne 00064 , prevNode 00065 , 0 00066 , true 00067 , true 00068 , manager 00069 ); 00070 } 00071 else if (testCh == chPlus) 00072 { 00073 return new (manager) ContentSpecNode 00074 ( 00075 ContentSpecNode::OneOrMore 00076 , prevNode 00077 , 0 00078 , true 00079 , true 00080 , manager 00081 ); 00082 } 00083 else if (testCh == chAsterisk) 00084 { 00085 return new (manager) ContentSpecNode 00086 ( 00087 ContentSpecNode::ZeroOrMore 00088 , prevNode 00089 , 0 00090 , true 00091 , true 00092 , manager 00093 ); 00094 } 00095 00096 // Just return the incoming node 00097 return prevNode; 00098 } 00099 00100 // --------------------------------------------------------------------------- 00101 // DTDValidator: Constructors and Destructor 00102 // --------------------------------------------------------------------------- 00103 DTDScanner::DTDScanner( DTDGrammar* dtdGrammar 00104 , DocTypeHandler* const docTypeHandler 00105 , MemoryManager* const grammarPoolMemoryManager 00106 , MemoryManager* const manager) : 00107 fMemoryManager(manager) 00108 , fGrammarPoolMemoryManager(grammarPoolMemoryManager) 00109 , fDocTypeHandler(docTypeHandler) 00110 , fDumAttDef(0) 00111 , fDumElemDecl(0) 00112 , fDumEntityDecl(0) 00113 , fInternalSubset(false) 00114 , fNextAttrId(1) 00115 , fDTDGrammar(dtdGrammar) 00116 , fBufMgr(0) 00117 , fReaderMgr(0) 00118 , fScanner(0) 00119 , fPEntityDeclPool(0) 00120 , fEmptyNamespaceId(0) 00121 , fDocTypeReaderId(0) 00122 { 00123 fPEntityDeclPool = new (fMemoryManager) NameIdPool<DTDEntityDecl>(109, 128, fMemoryManager); 00124 } 00125 00126 DTDScanner::~DTDScanner() 00127 { 00128 delete fDumAttDef; 00129 delete fDumElemDecl; 00130 delete fDumEntityDecl; 00131 delete fPEntityDeclPool; 00132 } 00133 00134 // ----------------------------------------------------------------------- 00135 // Setter methods 00136 // ----------------------------------------------------------------------- 00137 void DTDScanner::setScannerInfo(XMLScanner* const owningScanner 00138 , ReaderMgr* const readerMgr 00139 , XMLBufferMgr* const bufMgr) 00140 { 00141 // We don't own any of these, we just reference them 00142 fScanner = owningScanner; 00143 fReaderMgr = readerMgr; 00144 fBufMgr = bufMgr; 00145 00146 if (fScanner->getDoNamespaces()) 00147 fEmptyNamespaceId = fScanner->getEmptyNamespaceId(); 00148 else 00149 fEmptyNamespaceId = 0; 00150 00151 fDocTypeReaderId = fReaderMgr->getCurrentReaderNum(); 00152 } 00153 00154 00155 // --------------------------------------------------------------------------- 00156 // DTDScanner: Private scanning methods 00157 // --------------------------------------------------------------------------- 00158 bool DTDScanner::checkForPERef( const bool inLiteral 00159 , const bool inMarkup) 00160 { 00161 bool gotSpace = false; 00162 00163 // 00164 // See if we have any spaces up front. If so, then skip them and set 00165 // the gotSpaces flag. 00166 // 00167 if (fReaderMgr->skippedSpace()) 00168 { 00169 fReaderMgr->skipPastSpaces(); 00170 gotSpace = true; 00171 } 00172 00173 // If the next char is a percent, then expand the PERef 00174 if (!fReaderMgr->skippedChar(chPercent)) 00175 return gotSpace; 00176 00177 while (true) 00178 { 00179 if (!expandPERef(false, inLiteral, inMarkup, false)) 00180 fScanner->emitError(XMLErrs::ExpectedEntityRefName); 00181 // And skip any more spaces in the expanded value 00182 if (fReaderMgr->skippedSpace()) 00183 { 00184 fReaderMgr->skipPastSpaces(); 00185 gotSpace = true; 00186 } 00187 if (!fReaderMgr->skippedChar(chPercent)) 00188 break; 00189 } 00190 return gotSpace; 00191 } 00192 00193 00194 bool DTDScanner::expandPERef( const bool scanExternal 00195 , const bool inLiteral 00196 , const bool inMarkup 00197 , const bool throwEndOfExt) 00198 { 00199 fScanner->setHasNoDTD(false); 00200 XMLBufBid bbName(fBufMgr); 00201 00202 // 00203 // If we are in the internal subset and in markup, then this is 00204 // an error but we go ahead and do it anyway. 00205 // 00206 if (fInternalSubset && inMarkup) 00207 fScanner->emitError(XMLErrs::PERefInMarkupInIntSubset); 00208 00209 if (!fReaderMgr->getName(bbName.getBuffer())) 00210 { 00211 fScanner->emitError(XMLErrs::ExpectedPEName); 00212 00213 // Skip the semicolon if that's what we ended up on 00214 fReaderMgr->skippedChar(chSemiColon); 00215 return false; 00216 } 00217 00218 // If no terminating semicolon, emit an error but try to keep going 00219 if (!fReaderMgr->skippedChar(chSemiColon)) 00220 fScanner->emitError(XMLErrs::UnterminatedEntityRef, bbName.getRawBuffer()); 00221 00222 // 00223 // Look it up in the PE decl pool and see if it exists. If not, just 00224 // emit an error and continue. 00225 // 00226 XMLEntityDecl* decl = fPEntityDeclPool->getByKey(bbName.getRawBuffer()); 00227 if (!decl) 00228 { 00229 // XML 1.0 Section 4.1 00230 if (fScanner->getStandalone()) { 00231 // no need to check fScanner->fHasNoDTD which is for sure false 00232 // since we are in expandPERef already 00233 fScanner->emitError(XMLErrs::EntityNotFound, bbName.getRawBuffer()); 00234 } 00235 else { 00236 if (fScanner->getValidationScheme() == XMLScanner::Val_Always) 00237 fScanner->getValidator()->emitError(XMLValid::VC_EntityNotFound, bbName.getRawBuffer()); 00238 } 00239 00240 return false; 00241 } 00242 00243 // 00244 // XML 1.0 Section 2.9 00245 // If we are a standalone document, then it has to have been declared 00246 // in the internal subset. Keep going though. 00247 // 00248 if (fScanner->getValidationScheme() == XMLScanner::Val_Always && fScanner->getStandalone() && !decl->getDeclaredInIntSubset()) 00249 fScanner->getValidator()->emitError(XMLValid::VC_IllegalRefInStandalone, bbName.getRawBuffer()); 00250 00251 // 00252 // Okee dokee, we found it. So create either a memory stream with 00253 // the entity value contents, or a file stream if its an external 00254 // entity. 00255 // 00256 if (decl->isExternal()) 00257 { 00258 // And now create a reader to read this entity 00259 InputSource* srcUsed; 00260 XMLReader* reader = fReaderMgr->createReader 00261 ( 00262 decl->getBaseURI() 00263 , decl->getSystemId() 00264 , decl->getPublicId() 00265 , false 00266 , inLiteral ? XMLReader::RefFrom_Literal : XMLReader::RefFrom_NonLiteral 00267 , XMLReader::Type_PE 00268 , XMLReader::Source_External 00269 , srcUsed 00270 , fScanner->getCalculateSrcOfs() 00271 , fScanner->getLowWaterMark() 00272 , fScanner->getDisableDefaultEntityResolution() 00273 ); 00274 00275 // Put a janitor on the source so its cleaned up on exit 00276 Janitor<InputSource> janSrc(srcUsed); 00277 00278 // If the creation failed then throw an exception 00279 if (!reader) 00280 ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Gen_CouldNotOpenExtEntity, srcUsed ? srcUsed->getSystemId() : decl->getSystemId(), fMemoryManager); 00281 00282 // Set the 'throw at end' flag, to the one we were given 00283 reader->setThrowAtEnd(throwEndOfExt); 00284 00285 // 00286 // Push the reader. If its a recursive expansion, then emit an error 00287 // and return an failure. 00288 // 00289 if (!fReaderMgr->pushReader(reader, decl)) 00290 { 00291 fScanner->emitError(XMLErrs::RecursiveEntity, decl->getName()); 00292 return false; 00293 } 00294 00295 // 00296 // If the caller wants us to scan the external entity, then lets 00297 // do that now. 00298 // 00299 if (scanExternal) 00300 { 00301 XMLEntityHandler* entHandler = fScanner->getEntityHandler(); 00302 00303 // If we have an entity handler, tell it we are starting this entity 00304 if (entHandler) 00305 entHandler->startInputSource(*srcUsed); 00306 00307 // 00308 // Scan the external entity now. The parameter tells it that 00309 // it is not in an include section. Get the current reader 00310 // level so we can catch partial markup errors and be sure 00311 // to get back to here if we get an exception out of the 00312 // ext subset scan. 00313 // 00314 const XMLSize_t readerNum = fReaderMgr->getCurrentReaderNum(); 00315 try 00316 { 00317 scanExtSubsetDecl(false, false); 00318 } 00319 catch(const OutOfMemoryException&) 00320 { 00321 throw; 00322 } 00323 catch(...) 00324 { 00325 // Pop the reader back to the original level 00326 fReaderMgr->cleanStackBackTo(readerNum); 00327 00328 // End the input source, even though its not happy 00329 if (entHandler) 00330 entHandler->endInputSource(*srcUsed); 00331 throw; 00332 } 00333 00334 // If we have an entity handler, tell it we are ending this entity 00335 if (entHandler) 00336 entHandler->endInputSource(*srcUsed); 00337 } 00338 else { 00339 // If it starts with the XML string, then parse a text decl 00340 if (fScanner->checkXMLDecl(true)) 00341 scanTextDecl(); 00342 } 00343 } 00344 else 00345 { 00346 // Create a reader over a memory stream over the entity value 00347 XMLReader* valueReader = fReaderMgr->createIntEntReader 00348 ( 00349 decl->getName() 00350 , inLiteral ? XMLReader::RefFrom_Literal : XMLReader::RefFrom_NonLiteral 00351 , XMLReader::Type_PE 00352 , decl->getValue() 00353 , decl->getValueLen() 00354 , false 00355 ); 00356 00357 // 00358 // Trt to push the entity reader onto the reader manager stack, 00359 // where it will become the subsequent input. If it fails, that 00360 // means the entity is recursive, so issue an error. The reader 00361 // will have just been discarded, but we just keep going. 00362 // 00363 if (!fReaderMgr->pushReader(valueReader, decl)) 00364 fScanner->emitError(XMLErrs::RecursiveEntity, decl->getName()); 00365 } 00366 00367 return true; 00368 } 00369 00370 00371 bool DTDScanner::getQuotedString(XMLBuffer& toFill) 00372 { 00373 // Reset the target buffer 00374 toFill.reset(); 00375 00376 // Get the next char which must be a single or double quote 00377 XMLCh quoteCh; 00378 if (!fReaderMgr->skipIfQuote(quoteCh)) 00379 return false; 00380 00381 XMLCh nextCh; 00382 // Get another char and see if it matches the starting quote char 00383 while ((nextCh=fReaderMgr->getNextChar())!=quoteCh) 00384 { 00385 // 00386 // We should never get either an end of file null char here. If we 00387 // do, just fail. It will be handled more gracefully in the higher 00388 // level code that called us. 00389 // 00390 if (!nextCh) 00391 return false; 00392 00393 // Else add it to the buffer 00394 toFill.append(nextCh); 00395 } 00396 return true; 00397 } 00398 00399 00400 XMLAttDef* 00401 DTDScanner::scanAttDef(DTDElementDecl& parentElem, XMLBuffer& bufToUse) 00402 { 00403 // Check for PE ref or optional whitespace 00404 checkForPERef(false, true); 00405 00406 // Get the name of the attribute 00407 if (!fReaderMgr->getName(bufToUse)) 00408 { 00409 fScanner->emitError(XMLErrs::ExpectedAttrName); 00410 return 0; 00411 } 00412 00413 // 00414 // Look up this attribute in the parent element's attribute list. If 00415 // it already exists, then use the dummy. 00416 // 00417 DTDAttDef* decl = parentElem.getAttDef(bufToUse.getRawBuffer()); 00418 if (decl) 00419 { 00420 // It already exists, so put out a warning 00421 fScanner->emitError 00422 ( 00423 XMLErrs::AttListAlreadyExists 00424 , bufToUse.getRawBuffer() 00425 , parentElem.getFullName() 00426 ); 00427 00428 // Use the dummy decl to parse into and set its name to the name we got 00429 if (!fDumAttDef) 00430 { 00431 fDumAttDef = new (fMemoryManager) DTDAttDef(fMemoryManager); 00432 fDumAttDef->setId(fNextAttrId++); 00433 } 00434 fDumAttDef->setName(bufToUse.getRawBuffer()); 00435 decl = fDumAttDef; 00436 } 00437 else 00438 { 00439 // 00440 // It does not already exist so create a new one, give it the next 00441 // available unique id, and add it 00442 // 00443 decl = new (fGrammarPoolMemoryManager) DTDAttDef 00444 ( 00445 bufToUse.getRawBuffer() 00446 , XMLAttDef::CData 00447 , XMLAttDef::Implied 00448 , fGrammarPoolMemoryManager 00449 ); 00450 decl->setId(fNextAttrId++); 00451 decl->setExternalAttDeclaration(isReadingExternalEntity()); 00452 parentElem.addAttDef(decl); 00453 } 00454 00455 // Set a flag to indicate whether we are doing a dummy parse 00456 const bool isIgnored = (decl == fDumAttDef); 00457 00458 // Space is required here, so check for PE ref, and require space 00459 if (!checkForPERef(false, true)) 00460 fScanner->emitError(XMLErrs::ExpectedWhitespace); 00461 00462 // 00463 // Next has to be one of the attribute type strings. This tells us what 00464 // is to follow. 00465 // 00466 if (fReaderMgr->skippedString(XMLUni::fgCDATAString)) 00467 { 00468 decl->setType(XMLAttDef::CData); 00469 } 00470 else if (fReaderMgr->skippedString(XMLUni::fgIDString)) 00471 { 00472 if (!fReaderMgr->skippedString(XMLUni::fgRefString)) 00473 decl->setType(XMLAttDef::ID); 00474 else if (!fReaderMgr->skippedChar(chLatin_S)) 00475 decl->setType(XMLAttDef::IDRef); 00476 else 00477 decl->setType(XMLAttDef::IDRefs); 00478 } 00479 else if (fReaderMgr->skippedString(XMLUni::fgEntitString)) 00480 { 00481 if (fReaderMgr->skippedChar(chLatin_Y)) 00482 { 00483 decl->setType(XMLAttDef::Entity); 00484 } 00485 else if (fReaderMgr->skippedString(XMLUni::fgIESString)) 00486 { 00487 decl->setType(XMLAttDef::Entities); 00488 } 00489 else 00490 { 00491 fScanner->emitError 00492 ( 00493 XMLErrs::ExpectedAttributeType 00494 , decl->getFullName() 00495 , parentElem.getFullName() 00496 ); 00497 return 0; 00498 } 00499 } 00500 else if (fReaderMgr->skippedString(XMLUni::fgNmTokenString)) 00501 { 00502 if (fReaderMgr->skippedChar(chLatin_S)) 00503 decl->setType(XMLAttDef::NmTokens); 00504 else 00505 decl->setType(XMLAttDef::NmToken); 00506 } 00507 else if (fReaderMgr->skippedString(XMLUni::fgNotationString)) 00508 { 00509 // Check for PE ref and require space 00510 if (!checkForPERef(false, true)) 00511 fScanner->emitError(XMLErrs::ExpectedWhitespace); 00512 00513 decl->setType(XMLAttDef::Notation); 00514 if (!scanEnumeration(*decl, bufToUse, true)) 00515 return 0; 00516 00517 // Set the value as the enumeration for this decl 00518 decl->setEnumeration(bufToUse.getRawBuffer()); 00519 } 00520 else if (fReaderMgr->skippedChar(chOpenParen)) 00521 { 00522 decl->setType(XMLAttDef::Enumeration); 00523 if (!scanEnumeration(*decl, bufToUse, false)) 00524 return 0; 00525 00526 // Set the value as the enumeration for this decl 00527 decl->setEnumeration(bufToUse.getRawBuffer()); 00528 } 00529 else 00530 { 00531 fScanner->emitError 00532 ( 00533 XMLErrs::ExpectedAttributeType 00534 , decl->getFullName() 00535 , parentElem.getFullName() 00536 ); 00537 return 0; 00538 } 00539 00540 // Space is required here, so check for PE ref, and require space 00541 if (!checkForPERef(false, true)) 00542 fScanner->emitError(XMLErrs::ExpectedWhitespace); 00543 00544 // And then scan for the optional default value declaration 00545 scanDefaultDecl(*decl); 00546 00547 // If validating, then do a couple of validation constraints 00548 if (fScanner->getValidationScheme() == XMLScanner::Val_Always) 00549 { 00550 if (decl->getType() == XMLAttDef::ID) 00551 { 00552 if ((decl->getDefaultType() != XMLAttDef::Implied) 00553 && (decl->getDefaultType() != XMLAttDef::Required)) 00554 { 00555 fScanner->getValidator()->emitError(XMLValid::BadIDAttrDefType, decl->getFullName()); 00556 } 00557 } 00558 00559 // if attdef is xml:space, check correct enumeration (default|preserve) 00560 const XMLCh fgXMLSpace[] = { chLatin_x, chLatin_m, chLatin_l, chColon, chLatin_s, chLatin_p, chLatin_a, chLatin_c, chLatin_e, chNull }; 00561 00562 if (XMLString::equals(decl->getFullName(),fgXMLSpace)) { 00563 const XMLCh fgPreserve[] = { chLatin_p, chLatin_r, chLatin_e, chLatin_s, chLatin_e, chLatin_r, chLatin_v, chLatin_e, chNull }; 00564 const XMLCh fgDefault[] = { chLatin_d, chLatin_e, chLatin_f, chLatin_a, chLatin_u, chLatin_l, chLatin_t, chNull }; 00565 bool ok = false; 00566 if (decl->getType() == XMLAttDef::Enumeration) { 00567 BaseRefVectorOf<XMLCh>* enumVector = XMLString::tokenizeString(decl->getEnumeration(), fMemoryManager); 00568 XMLSize_t size = enumVector->size(); 00569 ok = (size == 1 && 00570 (XMLString::equals(enumVector->elementAt(0), fgDefault) || 00571 XMLString::equals(enumVector->elementAt(0), fgPreserve))) || 00572 (size == 2 && 00573 (XMLString::equals(enumVector->elementAt(0), fgDefault) && 00574 XMLString::equals(enumVector->elementAt(1), fgPreserve))) || 00575 (size == 2 && 00576 (XMLString::equals(enumVector->elementAt(1), fgDefault) && 00577 XMLString::equals(enumVector->elementAt(0), fgPreserve))); 00578 delete enumVector; 00579 } 00580 if (!ok) 00581 fScanner->getValidator()->emitError(XMLValid::IllegalXMLSpace); 00582 } 00583 } 00584 00585 // If we have a doc type handler, tell it about this attdef. 00586 if (fDocTypeHandler) 00587 fDocTypeHandler->attDef(parentElem, *decl, isIgnored); 00588 return decl; 00589 } 00590 00591 00592 void DTDScanner::scanAttListDecl() 00593 { 00594 // Space is required here, so check for a PE ref 00595 if (!checkForPERef(false, true)) 00596 { 00597 fScanner->emitError(XMLErrs::ExpectedWhitespace); 00598 fReaderMgr->skipPastChar(chCloseAngle); 00599 return; 00600 } 00601 00602 // 00603 // Next should be the name of the element it belongs to, so get a buffer 00604 // and get the name into it. 00605 // 00606 XMLBufBid bbName(fBufMgr); 00607 if (!fReaderMgr->getName(bbName.getBuffer())) 00608 { 00609 fScanner->emitError(XMLErrs::ExpectedElementName); 00610 fReaderMgr->skipPastChar(chCloseAngle); 00611 return; 00612 } 00613 00614 // 00615 // Find this element's declaration. If it has not been declared yet, 00616 // we will force one into the list, but not mark it as declared. 00617 // 00618 DTDElementDecl* elemDecl = (DTDElementDecl*) fDTDGrammar->getElemDecl(fEmptyNamespaceId, 0, bbName.getRawBuffer(), Grammar::TOP_LEVEL_SCOPE); 00619 if (!elemDecl) 00620 { 00621 // 00622 // Lets fault in a declaration and add it to the pool. We mark 00623 // it having been created because of an attlist. Later, if its 00624 // declared, this will be updated. 00625 // 00626 elemDecl = new (fGrammarPoolMemoryManager) DTDElementDecl 00627 ( 00628 bbName.getRawBuffer() 00629 , fEmptyNamespaceId 00630 , DTDElementDecl::Any 00631 , fGrammarPoolMemoryManager 00632 ); 00633 elemDecl->setCreateReason(XMLElementDecl::AttList); 00634 elemDecl->setExternalElemDeclaration(isReadingExternalEntity()); 00635 fDTDGrammar->putElemDecl((XMLElementDecl*) elemDecl); 00636 } 00637 00638 // If we have a doc type handler, tell it the att list is starting 00639 if (fDocTypeHandler) 00640 fDocTypeHandler->startAttList(*elemDecl); 00641 00642 // 00643 // Now we loop until we are done with all of the attributes in this 00644 // list. We need a buffer to use for local processing. 00645 // 00646 XMLBufBid bbTmp(fBufMgr); 00647 XMLBuffer& tmpBuf = bbTmp.getBuffer(); 00648 bool seenAnId = false; 00649 while (true) 00650 { 00651 // Get the next char out and see what it tells us to do 00652 const XMLCh nextCh = fReaderMgr->peekNextChar(); 00653 00654 // Watch for EOF 00655 if (!nextCh) 00656 ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager); 00657 00658 if (nextCh == chCloseAngle) 00659 { 00660 // We are done with this attribute list 00661 fReaderMgr->getNextChar(); 00662 break; 00663 } 00664 else if (fReaderMgr->getCurrentReader()->isWhitespace(nextCh)) 00665 { 00666 // 00667 // If advanced callbacks are enabled and we have a doc 00668 // type handler, then gather up the white space and call 00669 // back on the doctype handler. Otherwise, just skip 00670 // whitespace. 00671 // 00672 if (fDocTypeHandler) 00673 { 00674 fReaderMgr->getSpaces(tmpBuf); 00675 fDocTypeHandler->doctypeWhitespace 00676 ( 00677 tmpBuf.getRawBuffer() 00678 , tmpBuf.getLen() 00679 ); 00680 } 00681 else 00682 { 00683 fReaderMgr->skipPastSpaces(); 00684 } 00685 } 00686 else if (nextCh == chPercent) 00687 { 00688 // Eat the percent and expand the ref 00689 fReaderMgr->getNextChar(); 00690 expandPERef(false, false, true); 00691 } 00692 else 00693 { 00694 // 00695 // It must be an attribute name, so scan it. We let 00696 // it use our local buffer for its name scanning. 00697 // 00698 XMLAttDef* attDef = scanAttDef(*elemDecl, tmpBuf); 00699 00700 if (!attDef) 00701 { 00702 fReaderMgr->skipPastChar(chCloseAngle); 00703 break; 00704 } 00705 00706 // 00707 // If we are validating and its an ID type, then we have to 00708 // make sure that we have not seen an id attribute yet. Set 00709 // the flag to say that we've seen one now also. 00710 // 00711 if (fScanner->getValidationScheme() == XMLScanner::Val_Always) 00712 { 00713 if (attDef->getType() == XMLAttDef::ID) 00714 { 00715 if (seenAnId) 00716 fScanner->getValidator()->emitError(XMLValid::MultipleIdAttrs, elemDecl->getFullName()); 00717 seenAnId = true; 00718 } 00719 } 00720 } 00721 } 00722 00723 // If we have a doc type handler, tell it the att list is ending 00724 if (fDocTypeHandler) 00725 fDocTypeHandler->endAttList(*elemDecl); 00726 } 00727 00728 00729 // 00730 // This method is called to scan the value of an attribute in content. This 00731 // involves some normalization and replacement of general entity and 00732 // character references. 00733 // 00734 // End of entity's must be dealt with here. During DTD scan, they can come 00735 // from external entities. During content, they can come from any entity. 00736 // We just eat the end of entity and continue with our scan until we come 00737 // to the closing quote. If an unterminated value causes us to go through 00738 // subsequent entities, that will cause errors back in the calling code, 00739 // but there's little we can do about it here. 00740 // 00741 bool DTDScanner::scanAttValue(const XMLCh* const attrName 00742 , XMLBuffer& toFill 00743 , const XMLAttDef::AttTypes type) 00744 { 00745 enum States 00746 { 00747 InWhitespace 00748 , InContent 00749 }; 00750 00751 // Reset the target buffer 00752 toFill.reset(); 00753 00754 // Get the next char which must be a single or double quote 00755 XMLCh quoteCh; 00756 if (!fReaderMgr->skipIfQuote(quoteCh)) 00757 return false; 00758 00759 // 00760 // We have to get the current reader because we have to ignore closing 00761 // quotes until we hit the same reader again. 00762 // 00763 const XMLSize_t curReader = fReaderMgr->getCurrentReaderNum(); 00764 00765 // 00766 // Loop until we get the attribute value. Note that we use a double 00767 // loop here to avoid the setup/teardown overhead of the exception 00768 // handler on every round. 00769 // 00770 XMLCh nextCh; 00771 XMLCh secondCh = 0; 00772 States curState = InContent; 00773 bool firstNonWS = false; 00774 bool gotLeadingSurrogate = false; 00775 bool escaped; 00776 while (true) 00777 { 00778 try 00779 { 00780 while(true) 00781 { 00782 nextCh = fReaderMgr->getNextChar(); 00783 00784 if (!nextCh) 00785 ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager); 00786 00787 // Check for our ending quote in the same entity 00788 if (nextCh == quoteCh) 00789 { 00790 if (curReader == fReaderMgr->getCurrentReaderNum()) 00791 return true; 00792 00793 // Watch for spillover into a previous entity 00794 if (curReader > fReaderMgr->getCurrentReaderNum()) 00795 { 00796 fScanner->emitError(XMLErrs::PartialMarkupInEntity); 00797 return false; 00798 } 00799 } 00800 00801 // 00802 // Check for an entity ref now, before we let it affect our 00803 // whitespace normalization logic below. We ignore the empty flag 00804 // in this one. 00805 // 00806 escaped = false; 00807 if (nextCh == chAmpersand) 00808 { 00809 if (scanEntityRef(nextCh, secondCh, escaped) != EntityExp_Returned) 00810 { 00811 gotLeadingSurrogate = false; 00812 continue; 00813 } 00814 } 00815 else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF)) 00816 { 00817 // Check for correct surrogate pairs 00818 if (gotLeadingSurrogate) 00819 fScanner->emitError(XMLErrs::Expected2ndSurrogateChar); 00820 else 00821 gotLeadingSurrogate = true; 00822 } 00823 else 00824 { 00825 if (gotLeadingSurrogate) 00826 { 00827 if ((nextCh < 0xDC00) || (nextCh > 0xDFFF)) 00828 fScanner->emitError(XMLErrs::Expected2ndSurrogateChar); 00829 } 00830 // Its got to at least be a valid XML character 00831 else if (!fReaderMgr->getCurrentReader()->isXMLChar(nextCh)) 00832 { 00833 XMLCh tmpBuf[9]; 00834 XMLString::binToText 00835 ( 00836 nextCh 00837 , tmpBuf 00838 , 8 00839 , 16 00840 , fMemoryManager 00841 ); 00842 fScanner->emitError 00843 ( 00844 XMLErrs::InvalidCharacterInAttrValue 00845 , attrName 00846 , tmpBuf 00847 ); 00848 } 00849 00850 gotLeadingSurrogate = false; 00851 } 00852 00853 // 00854 // If its not escaped, then make sure its not a < character, which 00855 // is not allowed in attribute values. 00856 // 00857 if (!escaped && (nextCh == chOpenAngle)) 00858 fScanner->emitError(XMLErrs::BracketInAttrValue, attrName); 00859 00860 // 00861 // If the attribute is a CDATA type we do simple replacement of 00862 // tabs and new lines with spaces, if the character is not escaped 00863 // by way of a char ref. 00864 // 00865 // Otherwise, we do the standard non-CDATA normalization of 00866 // compressing whitespace to single spaces and getting rid of 00867 // leading and trailing whitespace. 00868 // 00869 if (type == XMLAttDef::CData) 00870 { 00871 if (!escaped) 00872 { 00873 if ((nextCh == 0x09) || (nextCh == 0x0A) || (nextCh == 0x0D)) 00874 nextCh = chSpace; 00875 } 00876 } 00877 else 00878 { 00879 if (curState == InWhitespace) 00880 { 00881 if (!fReaderMgr->getCurrentReader()->isWhitespace(nextCh)) 00882 { 00883 if (firstNonWS) 00884 toFill.append(chSpace); 00885 curState = InContent; 00886 firstNonWS = true; 00887 } 00888 else 00889 { 00890 continue; 00891 } 00892 } 00893 else if (curState == InContent) 00894 { 00895 if (fReaderMgr->getCurrentReader()->isWhitespace(nextCh)) 00896 { 00897 curState = InWhitespace; 00898 continue; 00899 } 00900 firstNonWS = true; 00901 } 00902 } 00903 00904 // Else add it to the buffer 00905 toFill.append(nextCh); 00906 00907 if (secondCh) 00908 { 00909 toFill.append(secondCh); 00910 secondCh=0; 00911 } 00912 } 00913 } 00914 00915 catch(const EndOfEntityException&) 00916 { 00917 // Just eat it and continue. 00918 gotLeadingSurrogate = false; 00919 escaped = false; 00920 } 00921 } 00922 return true; 00923 } 00924 00925 00926 bool DTDScanner::scanCharRef(XMLCh& first, XMLCh& second) 00927 { 00928 bool gotOne = false; 00929 unsigned int value = 0; 00930 00931 // 00932 // Set the radix. Its supposed to be a lower case x if hex. But, in 00933 // order to recover well, we check for an upper and put out an error 00934 // for that. 00935 // 00936 unsigned int radix = 10; 00937 00938 if (fReaderMgr->skippedChar(chLatin_x)) 00939 { 00940 radix = 16; 00941 } 00942 else if (fReaderMgr->skippedChar(chLatin_X)) 00943 { 00944 fScanner->emitError(XMLErrs::HexRadixMustBeLowerCase); 00945 radix = 16; 00946 } 00947 00948 while (true) 00949 { 00950 const XMLCh nextCh = fReaderMgr->peekNextChar(); 00951 00952 // Watch for EOF 00953 if (!nextCh) 00954 ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager); 00955 00956 // Break out on the terminating semicolon 00957 if (nextCh == chSemiColon) 00958 { 00959 fReaderMgr->getNextChar(); 00960 break; 00961 } 00962 00963 // 00964 // Convert this char to a binary value, or bail out if its not 00965 // one. 00966 // 00967 unsigned int nextVal; 00968 if ((nextCh >= chDigit_0) && (nextCh <= chDigit_9)) 00969 nextVal = (unsigned int)(nextCh - chDigit_0); 00970 else if ((nextCh >= chLatin_A) && (nextCh <= chLatin_F)) 00971 nextVal= (unsigned int)(10 + (nextCh - chLatin_A)); 00972 else if ((nextCh >= chLatin_a) && (nextCh <= chLatin_f)) 00973 nextVal = (unsigned int)(10 + (nextCh - chLatin_a)); 00974 else 00975 { 00976 // 00977 // If we got at least a sigit, then do an unterminated ref 00978 // error. Else, do an expected a numerical ref thing. 00979 // 00980 if (gotOne) 00981 fScanner->emitError(XMLErrs::UnterminatedCharRef); 00982 else 00983 fScanner->emitError(XMLErrs::ExpectedNumericalCharRef); 00984 00985 return false; 00986 } 00987 00988 // 00989 // Make sure its valid for the radix. If not, then just eat the 00990 // digit and go on after issueing an error. Else, update the 00991 // running value with this new digit. 00992 // 00993 if (nextVal >= radix) 00994 { 00995 XMLCh tmpStr[2]; 00996 tmpStr[0] = nextCh; 00997 tmpStr[1] = chNull; 00998 fScanner->emitError(XMLErrs::BadDigitForRadix, tmpStr); 00999 } 01000 else 01001 { 01002 value = (value * radix) + nextVal; 01003 } 01004 01005 // Indicate that we got at least one good digit 01006 gotOne = true; 01007 01008 // Eat the char we just processed 01009 fReaderMgr->getNextChar(); 01010 } 01011 01012 // Return the char (or chars) 01013 // And check if the character expanded is valid or not 01014 if (value >= 0x10000 && value <= 0x10FFFF) 01015 { 01016 value -= 0x10000; 01017 first = XMLCh((value >> 10) + 0xD800); 01018 second = XMLCh((value & 0x3FF) + 0xDC00); 01019 } 01020 else if (value <= 0xFFFD) 01021 { 01022 first = XMLCh(value); 01023 second = 0; 01024 if (!fReaderMgr->getCurrentReader()->isXMLChar(first) && !fReaderMgr->getCurrentReader()->isControlChar(first)) { 01025 // Character reference was not in the valid range 01026 fScanner->emitError(XMLErrs::InvalidCharacterRef); 01027 return false; 01028 } 01029 } 01030 else { 01031 // Character reference was not in the valid range 01032 fScanner->emitError(XMLErrs::InvalidCharacterRef); 01033 return false; 01034 } 01035 01036 return true; 01037 } 01038 01039 01040 ContentSpecNode* 01041 DTDScanner::scanChildren(const DTDElementDecl& elemDecl, XMLBuffer& bufToUse) 01042 { 01043 // Check for a PE ref here, but don't require spaces 01044 checkForPERef(false, true); 01045 01046 ValueStackOf<XMLSize_t>* arrNestedDecl=NULL; 01047 // 01048 // We know that the caller just saw an opening parenthesis, so we need 01049 // to parse until we hit the end of it; if we find several parenthesis, 01050 // store them in an array to be processed later. 01051 // 01052 // We have to check for one up front, since it could be something like 01053 // (((a)*)) etc... 01054 // 01055 ContentSpecNode* curNode = 0; 01056 while(fReaderMgr->skippedChar(chOpenParen)) 01057 { 01058 // to check entity nesting 01059 const XMLSize_t curReader = fReaderMgr->getCurrentReaderNum(); 01060 if(arrNestedDecl==NULL) 01061 arrNestedDecl=new (fMemoryManager) ValueStackOf<XMLSize_t>(5, fMemoryManager); 01062 arrNestedDecl->push(curReader); 01063 01064 // Check for a PE ref here, but don't require spaces 01065 checkForPERef(false, true); 01066 } 01067 01068 // We must find a leaf node here, either standalone or nested in the parenthesis 01069 if (!fReaderMgr->getName(bufToUse)) 01070 { 01071 fScanner->emitError(XMLErrs::ExpectedElementName); 01072 return 0; 01073 } 01074 01075 // 01076 // Create a leaf node for it. If we can find the element id for 01077 // this element, then use it. Else, we have to fault in an element 01078 // decl, marked as created because of being in a content model. 01079 // 01080 XMLElementDecl* decl = fDTDGrammar->getElemDecl(fEmptyNamespaceId, 0, bufToUse.getRawBuffer(), Grammar::TOP_LEVEL_SCOPE); 01081 if (!decl) 01082 { 01083 decl = new (fGrammarPoolMemoryManager) DTDElementDecl 01084 ( 01085 bufToUse.getRawBuffer() 01086 , fEmptyNamespaceId 01087 , DTDElementDecl::Any 01088 , fGrammarPoolMemoryManager 01089 ); 01090 decl->setCreateReason(XMLElementDecl::InContentModel); 01091 decl->setExternalElemDeclaration(isReadingExternalEntity()); 01092 fDTDGrammar->putElemDecl(decl); 01093 } 01094 curNode = new (fGrammarPoolMemoryManager) ContentSpecNode 01095 ( 01096 decl->getElementName() 01097 , fGrammarPoolMemoryManager 01098 ); 01099 01100 // Check for a PE ref here, but don't require spaces 01101 const bool gotSpaces = checkForPERef(false, true); 01102 01103 // Check for a repetition character after the leaf 01104 XMLCh repCh = fReaderMgr->peekNextChar(); 01105 ContentSpecNode* tmpNode = makeRepNode(repCh, curNode, fGrammarPoolMemoryManager); 01106 if (tmpNode != curNode) 01107 { 01108 if (gotSpaces) 01109 { 01110 if (fScanner->emitErrorWillThrowException(XMLErrs::UnexpectedWhitespace)) 01111 { 01112 delete tmpNode; 01113 } 01114 fScanner->emitError(XMLErrs::UnexpectedWhitespace); 01115 } 01116 fReaderMgr->getNextChar(); 01117 curNode = tmpNode; 01118 } 01119 01120 while(arrNestedDecl==NULL || !arrNestedDecl->empty()) 01121 { 01122 // Check for a PE ref here, but don't require spaces 01123 checkForPERef(false, true); 01124 01125 // 01126 // Ok, the next character tells us what kind of content this particular 01127 // model this particular parentesized section is. Its either a choice if 01128 // we see ',', a sequence if we see '|', or a single leaf node if we see 01129 // a closing paren. 01130 // 01131 const XMLCh opCh = fReaderMgr->peekNextChar(); 01132 01133 if ((opCh != chComma) 01134 && (opCh != chPipe) 01135 && (opCh != chCloseParen)) 01136 { 01137 // Not a legal char, so delete our node and return failure 01138 delete curNode; 01139 fScanner->emitError(XMLErrs::ExpectedSeqChoiceLeaf); 01140 return 0; 01141 } 01142 01143 // 01144 // Create the head node of the correct type. We need this to remember 01145 // the top of the local tree. If it was a single subexpr, then just 01146 // set the head node to the current node. For the others, we'll build 01147 // the tree off the second child as we move across. 01148 // 01149 ContentSpecNode* headNode = 0; 01150 ContentSpecNode::NodeTypes curType = ContentSpecNode::UnknownType; 01151 if (opCh == chComma) 01152 { 01153 curType = ContentSpecNode::Sequence; 01154 headNode = new (fGrammarPoolMemoryManager) ContentSpecNode 01155 ( 01156 curType 01157 , curNode 01158 , 0 01159 , true 01160 , true 01161 , fGrammarPoolMemoryManager 01162 ); 01163 curNode = headNode; 01164 } 01165 else if (opCh == chPipe) 01166 { 01167 curType = ContentSpecNode::Choice; 01168 headNode = new (fGrammarPoolMemoryManager) ContentSpecNode 01169 ( 01170 curType 01171 , curNode 01172 , 0 01173 , true 01174 , true 01175 , fGrammarPoolMemoryManager 01176 ); 01177 curNode = headNode; 01178 } 01179 else 01180 { 01181 headNode = curNode; 01182 fReaderMgr->getNextChar(); 01183 } 01184 01185 // 01186 // If it was a sequence or choice, we just loop until we get to the 01187 // end of our section, adding each new leaf or sub expression to the 01188 // right child of the current node, and making that new node the current 01189 // node. 01190 // 01191 if ((opCh == chComma) || (opCh == chPipe)) 01192 { 01193 ContentSpecNode* lastNode = 0; 01194 while (true) 01195 { 01196 // 01197 // The next thing must either be another | or , character followed 01198 // by another leaf or subexpression, or a closing parenthesis, or a 01199 // PE ref. 01200 // 01201 if (fReaderMgr->lookingAtChar(chPercent)) 01202 { 01203 checkForPERef(false, true); 01204 } 01205 else if (fReaderMgr->skippedSpace()) 01206 { 01207 // Just skip whitespace 01208 fReaderMgr->skipPastSpaces(); 01209 } 01210 else if (fReaderMgr->skippedChar(chCloseParen)) 01211 { 01212 // 01213 // We've hit the end of this section, so break out. But, we 01214 // need to see if we left a partial sequence of choice node 01215 // without a second node. If so, we have to undo that and 01216 // put its left child into the right node of the previous 01217 // node. 01218 // 01219 if ((curNode->getType() == ContentSpecNode::Choice) 01220 || (curNode->getType() == ContentSpecNode::Sequence)) 01221 { 01222 if (!curNode->getSecond()) 01223 { 01224 ContentSpecNode* saveFirst = curNode->orphanFirst(); 01225 lastNode->setSecond(saveFirst); 01226 curNode = lastNode; 01227 } 01228 } 01229 break; 01230 } 01231 else if (fReaderMgr->skippedChar(opCh)) 01232 { 01233 // Check for a PE ref here, but don't require spaces 01234 checkForPERef(false, true); 01235 01236 if (fReaderMgr->skippedChar(chOpenParen)) 01237 { 01238 const XMLSize_t curReader = fReaderMgr->getCurrentReaderNum(); 01239 01240 // Recurse to handle this new guy 01241 ContentSpecNode* subNode; 01242 try { 01243 subNode = scanChildren(elemDecl, bufToUse); 01244 } 01245 catch (const XMLErrs::Codes) 01246 { 01247 delete headNode; 01248 throw; 01249 } 01250 01251 // If it failed, we are done, clean up here and return failure 01252 if (!subNode) 01253 { 01254 delete headNode; 01255 return 0; 01256 } 01257 01258 if (curReader != fReaderMgr->getCurrentReaderNum() && fScanner->getValidationScheme() == XMLScanner::Val_Always) 01259 fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE); 01260 01261 // Else patch it in and make it the new current 01262 ContentSpecNode* newCur = new (fGrammarPoolMemoryManager) ContentSpecNode 01263 ( 01264 curType 01265 , subNode 01266 , 0 01267 , true 01268 , true 01269 , fGrammarPoolMemoryManager 01270 ); 01271 curNode->setSecond(newCur); 01272 lastNode = curNode; 01273 curNode = newCur; 01274 } 01275 else 01276 { 01277 // 01278 // Got to be a leaf node, so get a name. If we cannot get 01279 // one, then clean up and get outa here. 01280 // 01281 if (!fReaderMgr->getName(bufToUse)) 01282 { 01283 delete headNode; 01284 fScanner->emitError(XMLErrs::ExpectedElementName); 01285 return 0; 01286 } 01287 01288 // 01289 // Create a leaf node for it. If we can find the element 01290 // id for this element, then use it. Else, we have to 01291 // fault in an element decl, marked as created because 01292 // of being in a content model. 01293 // 01294 XMLElementDecl* decl = fDTDGrammar->getElemDecl(fEmptyNamespaceId, 0, bufToUse.getRawBuffer(), Grammar::TOP_LEVEL_SCOPE); 01295 if (!decl) 01296 { 01297 decl = new (fGrammarPoolMemoryManager) DTDElementDecl 01298 ( 01299 bufToUse.getRawBuffer() 01300 , fEmptyNamespaceId 01301 , DTDElementDecl::Any 01302 , fGrammarPoolMemoryManager 01303 ); 01304 decl->setCreateReason(XMLElementDecl::InContentModel); 01305 decl->setExternalElemDeclaration(isReadingExternalEntity()); 01306 fDTDGrammar->putElemDecl(decl); 01307 } 01308 01309 ContentSpecNode* tmpLeaf = new (fGrammarPoolMemoryManager) ContentSpecNode 01310 ( 01311 decl->getElementName() 01312 , fGrammarPoolMemoryManager 01313 ); 01314 01315 // Check for a repetition character after the leaf 01316 const XMLCh repCh = fReaderMgr->peekNextChar(); 01317 ContentSpecNode* tmpLeaf2 = makeRepNode(repCh, tmpLeaf, fGrammarPoolMemoryManager); 01318 if (tmpLeaf != tmpLeaf2) 01319 fReaderMgr->getNextChar(); 01320 01321 // 01322 // Create a new sequence or choice node, with the leaf 01323 // (or rep surrounding it) we just got as its first node. 01324 // Make the new node the second node of the current node, 01325 // and then make it the current node. 01326 // 01327 ContentSpecNode* newCur = new (fGrammarPoolMemoryManager) ContentSpecNode 01328 ( 01329 curType 01330 , tmpLeaf2 01331 , 0 01332 , true 01333 , true 01334 , fGrammarPoolMemoryManager 01335 ); 01336 curNode->setSecond(newCur); 01337 lastNode = curNode; 01338 curNode = newCur; 01339 } 01340 } 01341 else 01342 { 01343 // Cannot be valid 01344 delete headNode; // emitError may do a throw so need to clean-up first 01345 if (opCh == chComma) 01346 { 01347 fScanner->emitError(XMLErrs::ExpectedChoiceOrCloseParen); 01348 } 01349 else 01350 { 01351 fScanner->emitError 01352 ( 01353 XMLErrs::ExpectedSeqOrCloseParen 01354 , elemDecl.getFullName() 01355 ); 01356 } 01357 return 0; 01358 } 01359 } 01360 } 01361 01362 // 01363 // We saw the terminating parenthesis so lets check for any repetition 01364 // character, and create a node for that, making the head node the child 01365 // of it. 01366 // 01367 const XMLCh repCh = fReaderMgr->peekNextChar(); 01368 curNode = makeRepNode(repCh, headNode, fGrammarPoolMemoryManager); 01369 if (curNode != headNode) 01370 fReaderMgr->getNextChar(); 01371 01372 // prepare for recursion 01373 if(arrNestedDecl==NULL) 01374 break; 01375 else 01376 { 01377 // If that failed, no need to go further, return failure 01378 if (!curNode) 01379 return 0; 01380 01381 const XMLSize_t curReader = arrNestedDecl->pop(); 01382 if (curReader != fReaderMgr->getCurrentReaderNum() && fScanner->getValidationScheme() == XMLScanner::Val_Always) 01383 fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE); 01384 01385 if(arrNestedDecl->empty()) 01386 { 01387 delete arrNestedDecl; 01388 arrNestedDecl=NULL; 01389 } 01390 } 01391 } 01392 01393 return curNode; 01394 } 01395 01396 01397 // 01398 // We get here after the '<!--' part of the comment. We scan past the 01399 // terminating '-->' It will calls the appropriate handler with the comment 01400 // text, if one is provided. A comment can be in either the document or 01401 // the DTD, so the fInDocument flag is used to know which handler to send 01402 // it to. 01403 // 01404 void DTDScanner::scanComment() 01405 { 01406 enum States 01407 { 01408 InText 01409 , OneDash 01410 , TwoDashes 01411 }; 01412 01413 // Get a buffer for this 01414 XMLBufBid bbComment(fBufMgr); 01415 01416 // 01417 // Get the comment text into a temp buffer. Be sure to use temp buffer 01418 // two here, since its to be used for stuff that is potentially longer 01419 // than just a name. 01420 // 01421 bool gotLeadingSurrogate = false; 01422 States curState = InText; 01423 while (true) 01424 { 01425 // Get the next character 01426 const XMLCh nextCh = fReaderMgr->getNextChar(); 01427 01428 // Watch for an end of file 01429 if (!nextCh) 01430 { 01431 fScanner->emitError(XMLErrs::UnterminatedComment); 01432 ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager); 01433 } 01434 01435 // Check for correct surrogate pairs 01436 if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF)) 01437 { 01438 if (gotLeadingSurrogate) 01439 fScanner->emitError(XMLErrs::Expected2ndSurrogateChar); 01440 else 01441 gotLeadingSurrogate = true; 01442 } 01443 else 01444 { 01445 if (gotLeadingSurrogate) 01446 { 01447 if ((nextCh < 0xDC00) || (nextCh > 0xDFFF)) 01448 fScanner->emitError(XMLErrs::Expected2ndSurrogateChar); 01449 } 01450 // Its got to at least be a valid XML character 01451 else if (!fReaderMgr->getCurrentReader()->isXMLChar(nextCh)) { 01452 01453 XMLCh tmpBuf[9]; 01454 XMLString::binToText 01455 ( 01456 nextCh 01457 , tmpBuf 01458 , 8 01459 , 16 01460 , fMemoryManager 01461 ); 01462 fScanner->emitError(XMLErrs::InvalidCharacter, tmpBuf); 01463 } 01464 01465 gotLeadingSurrogate = false; 01466 } 01467 01468 if (curState == InText) 01469 { 01470 // If its a dash, go to OneDash state. Otherwise take as text 01471 if (nextCh == chDash) 01472 curState = OneDash; 01473 else 01474 bbComment.append(nextCh); 01475 } 01476 else if (curState == OneDash) 01477 { 01478 // 01479 // If its another dash, then we change to the two dashes states. 01480 // Otherwise, we have to put in the deficit dash and the new 01481 // character and go back to InText. 01482 // 01483 if (nextCh == chDash) 01484 { 01485 curState = TwoDashes; 01486 } 01487 else 01488 { 01489 bbComment.append(chDash); 01490 bbComment.append(nextCh); 01491 curState = InText; 01492 } 01493 } 01494 else if (curState == TwoDashes) 01495 { 01496 // The next character must be the closing bracket 01497 if (nextCh != chCloseAngle) 01498 { 01499 fScanner->emitError(XMLErrs::IllegalSequenceInComment); 01500 fReaderMgr->skipPastChar(chCloseAngle); 01501 return; 01502 } 01503 break; 01504 } 01505 } 01506 01507 // If there is a doc type handler, then pass on the comment stuff 01508 if (fDocTypeHandler) 01509 fDocTypeHandler->doctypeComment(bbComment.getRawBuffer()); 01510 } 01511 01512 01513 bool DTDScanner::scanContentSpec(DTDElementDecl& toFill) 01514 { 01515 // 01516 // Check for for a couple of the predefined content type strings. If 01517 // its not one of these, its got to be a parenthesized reg ex type 01518 // expression. 01519 // 01520 if (fReaderMgr->skippedString(XMLUni::fgEmptyString)) 01521 { 01522 toFill.setModelType(DTDElementDecl::Empty); 01523 return true; 01524 } 01525 01526 if (fReaderMgr->skippedString(XMLUni::fgAnyString)) 01527 { 01528 toFill.setModelType(DTDElementDecl::Any); 01529 return true; 01530 } 01531 01532 // Its got to be a parenthesized regular expression 01533 if (!fReaderMgr->skippedChar(chOpenParen)) 01534 { 01535 fScanner->emitError 01536 ( 01537 XMLErrs::ExpectedContentSpecExpr 01538 , toFill.getFullName() 01539 ); 01540 return false; 01541 } 01542 01543 // Get the current reader id, so we can test for partial markup 01544 const XMLSize_t curReader = fReaderMgr->getCurrentReaderNum(); 01545 01546 // We could have a PE ref here, but don't require space 01547 checkForPERef(false, true); 01548 01549 // 01550 // Now we look for a PCDATA string. If its PCDATA, then it must be a 01551 // MIXED model. Otherwise, it must be a regular list of children in 01552 // a regular expression perhaps. 01553 // 01554 bool status; 01555 if (fReaderMgr->skippedString(XMLUni::fgPCDATAString)) 01556 { 01557 // Set the model to mixed 01558 toFill.setModelType(DTDElementDecl::Mixed_Simple); 01559 status = scanMixed(toFill); 01560 01561 // 01562 // If we are validating we have to check that there are no multiple 01563 // uses of any child elements. 01564 // 01565 if (fScanner->getValidationScheme() == XMLScanner::Val_Always) 01566 { 01567 if (((const MixedContentModel*)toFill.getContentModel())->hasDups()) 01568 fScanner->getValidator()->emitError(XMLValid::RepElemInMixed); 01569 } 01570 } 01571 else 01572 { 01573 // 01574 // We have to do a recursive scan of the content model. Create a 01575 // buffer for it to use, for efficiency. It returns the top ofthe 01576 // content spec node tree, which we set if successful. 01577 // 01578 toFill.setModelType(DTDElementDecl::Children); 01579 XMLBufBid bbTmp(fBufMgr); 01580 ContentSpecNode* resNode = scanChildren(toFill, bbTmp.getBuffer()); 01581 status = (resNode != 0); 01582 if (status) 01583 toFill.setContentSpec(resNode); 01584 } 01585 01586 // Make sure we are on the same reader as where we started 01587 if (curReader != fReaderMgr->getCurrentReaderNum() && fScanner->getValidationScheme() == XMLScanner::Val_Always) 01588 fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE); 01589 01590 return status; 01591 } 01592 01593 01594 void DTDScanner::scanDefaultDecl(DTDAttDef& toFill) 01595 { 01596 if (fReaderMgr->skippedString(XMLUni::fgRequiredString)) 01597 { 01598 toFill.setDefaultType(XMLAttDef::Required); 01599 return; 01600 } 01601 01602 if (fReaderMgr->skippedString(XMLUni::fgImpliedString)) 01603 { 01604 toFill.setDefaultType(XMLAttDef::Implied); 01605 return; 01606 } 01607 01608 if (fReaderMgr->skippedString(XMLUni::fgFixedString)) 01609 { 01610 // 01611 // There must be space before the fixed value. If there is not, then 01612 // emit an error but keep going. 01613 // 01614 if (!fReaderMgr->skippedSpace()) 01615 fScanner->emitError(XMLErrs::ExpectedWhitespace); 01616 else 01617 fReaderMgr->skipPastSpaces(); 01618 toFill.setDefaultType(XMLAttDef::Fixed); 01619 } 01620 else 01621 { 01622 toFill.setDefaultType(XMLAttDef::Default); 01623 } 01624 01625 // 01626 // If we got here, its fixed or default, so we need to get a value. 01627 // If we don't, then emit an error but just set the default value to 01628 // an empty string and try to keep going. 01629 // 01630 // Check for PE ref or optional whitespace 01631 checkForPERef(false, true); 01632 01633 XMLBufBid bbValue(fBufMgr); 01634 if (!scanAttValue(toFill.getFullName(), bbValue.getBuffer(), toFill.getType())) 01635 fScanner->emitError(XMLErrs::ExpectedDefAttrDecl); 01636 01637 toFill.setValue(bbValue.getRawBuffer()); 01638 } 01639 01640 01641 // 01642 // This is called after seeing '<!ELEMENT' which indicates that an element 01643 // markup is starting. This guy scans the rest of it and adds it to the 01644 // element decl pool if it has not already been declared. 01645 // 01646 void DTDScanner::scanElementDecl() 01647 { 01648 // 01649 // Space is legal (required actually) here so check for a PE ref. If 01650 // we don't get our whitespace, then issue and error, but try to keep 01651 // going. 01652 // 01653 if (!checkForPERef(false, true)) 01654 fScanner->emitError(XMLErrs::ExpectedWhitespace); 01655 01656 // Get a buffer for the element name and scan in the name 01657 XMLBufBid bbName(fBufMgr); 01658 if (!fReaderMgr->getName(bbName.getBuffer())) 01659 { 01660 fScanner->emitError(XMLErrs::ExpectedElementName); 01661 fReaderMgr->skipPastChar(chCloseAngle); 01662 return; 01663 } 01664 01665 // Look this guy up in the element decl pool 01666 DTDElementDecl* decl = (DTDElementDecl*) fDTDGrammar->getElemDecl(fEmptyNamespaceId, 0, bbName.getRawBuffer(), Grammar::TOP_LEVEL_SCOPE); 01667 01668 // 01669 // If it does not exist, then we need to create it. If it does and 01670 // its marked as declared, then that's an error, but we still need to 01671 // scan over the content model so use the dummy declaration that the 01672 // parsing code can fill in. 01673 // 01674 if (decl) 01675 { 01676 if (decl->isDeclared()) 01677 { 01678 if (fScanner->getValidationScheme() == XMLScanner::Val_Always) 01679 fScanner->getValidator()->emitError(XMLValid::ElementAlreadyExists, bbName.getRawBuffer()); 01680 01681 if (!fDumElemDecl) 01682 fDumElemDecl = new (fMemoryManager) DTDElementDecl 01683 ( 01684 bbName.getRawBuffer() 01685 , fEmptyNamespaceId 01686 , DTDElementDecl::Any 01687 , fMemoryManager 01688 ); 01689 else 01690 fDumElemDecl->setElementName(bbName.getRawBuffer(),fEmptyNamespaceId); 01691 } 01692 } 01693 else 01694 { 01695 // 01696 // Create the new empty declaration to fill in and put it into 01697 // the decl pool. 01698 // 01699 decl = new (fGrammarPoolMemoryManager) DTDElementDecl 01700 ( 01701 bbName.getRawBuffer() 01702 , fEmptyNamespaceId 01703 , DTDElementDecl::Any 01704 , fGrammarPoolMemoryManager 01705 ); 01706 fDTDGrammar->putElemDecl(decl); 01707 } 01708 01709 // Set a flag for whether we will ignore this one 01710 const bool isIgnored = (decl == fDumElemDecl); 01711 01712 // Mark this one if being externally declared 01713 decl->setExternalElemDeclaration(isReadingExternalEntity()); 01714 01715 // Mark this one as being declared 01716 decl->setCreateReason(XMLElementDecl::Declared); 01717 01718 // Another check for a PE ref, with at least required whitespace 01719 if (!checkForPERef(false, true)) 01720 fScanner->emitError(XMLErrs::ExpectedWhitespace); 01721 01722 // And now scan the content model for this guy. 01723 if (!scanContentSpec(*decl)) 01724 { 01725 fReaderMgr->skipPastChar(chCloseAngle); 01726 return; 01727 } 01728 01729 // Another check for a PE ref, but we don't require whitespace here 01730 checkForPERef(false, true); 01731 01732 // And we should have the ending angle bracket 01733 if (!fReaderMgr->skippedChar(chCloseAngle)) 01734 { 01735 fScanner->emitError(XMLErrs::UnterminatedElementDecl, bbName.getRawBuffer()); 01736 fReaderMgr->skipPastChar(chCloseAngle); 01737 } 01738 01739 // 01740 // If we have a DTD handler tell it about the new element decl. We 01741 // tell it if its one that can be ignored, cause its an override of a 01742 // previously existing decl. If it is being ignored, only call back 01743 // if advanced callbacks are enabled. 01744 // 01745 if (fDocTypeHandler) 01746 fDocTypeHandler->elementDecl(*decl, isIgnored); 01747 } 01748 01749 01750 // 01751 // This method will process a general or parameter entity reference. The 01752 // entity name and entity text will be stored in the entity pool. The value 01753 // of the entity will be scanned for any other parameter entity or char 01754 // references which will be expanded. So the stored value can only have 01755 // general entity references when done. 01756 // 01757 void DTDScanner::scanEntityDecl() 01758 { 01759 // 01760 // Space is required here, but we cannot check for a PE Ref since 01761 // there could be a legal (no-ref) percent sign here. Since any 01762 // entity that ended here would be illegal, we just skip spaces 01763 // and then check for a percent. 01764 // 01765 if (!fReaderMgr->lookingAtSpace()) 01766 fScanner->emitError(XMLErrs::ExpectedWhitespace); 01767 else 01768 fReaderMgr->skipPastSpaces(); 01769 bool isPEDecl = fReaderMgr->skippedChar(chPercent); 01770 01771 // 01772 // If a PE decl, then check if it is followed by a space; if it is so, 01773 // eat the percent and check for spaces or a PE ref on the other side of it. 01774 // Otherwise, it has to be an entity reference for a general entity. 01775 // 01776 if (isPEDecl) 01777 { 01778 if(!fReaderMgr->getCurrentReader()->isWhitespace(fReaderMgr->peekNextChar())) 01779 { 01780 isPEDecl=false; 01781 while (true) 01782 { 01783 if (!expandPERef(false, false, true, false)) 01784 fScanner->emitError(XMLErrs::ExpectedEntityRefName); 01785 // And skip any more spaces in the expanded value 01786 if (fReaderMgr->skippedSpace()) 01787 fReaderMgr->skipPastSpaces(); 01788 if (!fReaderMgr->skippedChar(chPercent)) 01789 break; 01790 } 01791 } 01792 else if (!checkForPERef(false, true)) 01793 fScanner->emitError(XMLErrs::ExpectedWhitespace); 01794 } 01795 01796 // 01797 // Now lets get a name, which should be the name of the entity. We 01798 // have to get a buffer for this. 01799 // 01800 XMLBufBid bbName(fBufMgr); 01801 if (!fReaderMgr->getName(bbName.getBuffer())) 01802 { 01803 fScanner->emitError(XMLErrs::ExpectedPEName); 01804 fReaderMgr->skipPastChar(chCloseAngle); 01805 return; 01806 } 01807 01808 // If namespaces are enabled, then no colons allowed 01809 if (fScanner->getDoNamespaces()) 01810 { 01811 if (XMLString::indexOf(bbName.getRawBuffer(), chColon) != -1) 01812 fScanner->emitError(XMLErrs::ColonNotLegalWithNS); 01813 } 01814 01815 // 01816 // See if this entity already exists. If so, then the existing one 01817 // takes precendence. So we use the local dummy decl to parse into 01818 // and just ignore the results. 01819 // 01820 DTDEntityDecl* entityDecl; 01821 if (isPEDecl) 01822 entityDecl = fPEntityDeclPool->getByKey(bbName.getRawBuffer()); 01823 else 01824 entityDecl = fDTDGrammar->getEntityDecl(bbName.getRawBuffer()); 01825 01826 if (entityDecl) 01827 { 01828 if (!fDumEntityDecl) 01829 fDumEntityDecl = new (fMemoryManager) DTDEntityDecl(fMemoryManager); 01830 fDumEntityDecl->setName(bbName.getRawBuffer()); 01831 entityDecl = fDumEntityDecl; 01832 } 01833 else 01834 { 01835 // Its not in existence already, then create an entity decl for it 01836 entityDecl = new (fGrammarPoolMemoryManager) DTDEntityDecl(bbName.getRawBuffer(), false, fGrammarPoolMemoryManager); 01837 01838 // 01839 // Set the declaration location. The parameter indicates whether its 01840 // declared in the content/internal subset, so we know whether or not 01841 // its in the external subset. 01842 // 01843 entityDecl->setDeclaredInIntSubset(fInternalSubset); 01844 01845 // Add it to the appropriate entity decl pool 01846 if (isPEDecl) 01847 fPEntityDeclPool->put(entityDecl); 01848 else 01849 fDTDGrammar->putEntityDecl(entityDecl); 01850 } 01851 01852 // Set a flag that indicates whether we are ignoring this one 01853 const bool isIgnored = (entityDecl == fDumEntityDecl); 01854 01855 // Set the PE flag on it 01856 entityDecl->setIsParameter(isPEDecl); 01857 01858 // 01859 // Space is legal (required actually) here so check for a PE ref. If 01860 // we don't get our whitespace, then issue an error, but try to keep 01861 // going. 01862 // 01863 if (!checkForPERef(false, true)) 01864 fScanner->emitError(XMLErrs::ExpectedWhitespace); 01865 01866 // save the hasNoDTD status for Entity Constraint Checking 01867 bool hasNoDTD = fScanner->getHasNoDTD(); 01868 if (hasNoDTD && isPEDecl) 01869 fScanner->setHasNoDTD(false); 01870 01871 // According to the type call the value scanning method 01872 if (!scanEntityDef(*entityDecl, isPEDecl)) 01873 { 01874 fReaderMgr->skipPastChar(chCloseAngle); 01875 fScanner->setHasNoDTD(true); 01876 fScanner->emitError(XMLErrs::ExpectedEntityValue); 01877 return; 01878 } 01879 if (hasNoDTD) 01880 fScanner->setHasNoDTD(true); 01881 01882 // Space is legal (but not required) here so check for a PE ref 01883 checkForPERef(false, true); 01884 01885 // And then we have to have the closing angle bracket 01886 if (!fReaderMgr->skippedChar(chCloseAngle)) 01887 { 01888 fScanner->emitError(XMLErrs::UnterminatedEntityDecl, entityDecl->getName()); 01889 fReaderMgr->skipPastChar(chCloseAngle); 01890 } 01891 01892 // 01893 // If we have a doc type handler, then call it. But only call it for 01894 // ignored elements if advanced callbacks are enabled. 01895 // 01896 if (fDocTypeHandler) 01897 fDocTypeHandler->entityDecl(*entityDecl, isPEDecl, isIgnored); 01898 } 01899 01900 01901 // 01902 // This method will scan a general/character entity ref. It will either 01903 // expand a char ref and return the value directly, or it will expand 01904 // a general entity and a reader for it onto the reader stack. 01905 // 01906 // The return value indicates whether the value was returned directly or 01907 // pushed as a reader or it failed. 01908 // 01909 // The escaped flag tells the caller whether the returnd parameter resulted 01910 // from a character reference, which escapes the character in some cases. It 01911 // only makes any difference if the return indicates the value was returned 01912 // directly. 01913 // 01914 // NOTE: This is only called when scanning attribute values, so we always 01915 // expand general entities. 01916 // 01917 DTDScanner::EntityExpRes 01918 DTDScanner::scanEntityRef(XMLCh& firstCh, XMLCh& secondCh, bool& escaped) 01919 { 01920 // Assume no escape and no second char 01921 escaped = false; 01922 secondCh = 0; 01923 01924 // We have to insure its all done in a single entity 01925 const XMLSize_t curReader = fReaderMgr->getCurrentReaderNum(); 01926 01927 // 01928 // If the next char is a pound, then its a character reference and we 01929 // need to expand it always. 01930 // 01931 if (fReaderMgr->skippedChar(chPound)) 01932 { 01933 // 01934 // Its a character reference, so scan it and get back the numeric 01935 // value it represents. If it fails, just return immediately. 01936 // 01937 if (!scanCharRef(firstCh, secondCh)) 01938 return EntityExp_Failed; 01939 01940 if (curReader != fReaderMgr->getCurrentReaderNum()) 01941 fScanner->emitError(XMLErrs::PartialMarkupInEntity); 01942 01943 // Its now escaped since it was a char ref 01944 escaped = true; 01945 return EntityExp_Returned; 01946 } 01947 01948 // Get the name of the general entity 01949 XMLBufBid bbName(fBufMgr); 01950 if (!fReaderMgr->getName(bbName.getBuffer())) 01951 { 01952 fScanner->emitError(XMLErrs::ExpectedEntityRefName); 01953 return EntityExp_Failed; 01954 } 01955 01956 // 01957 // Next char must be a semi-colon. But if its not, just emit 01958 // an error and try to continue. 01959 // 01960 if (!fReaderMgr->skippedChar(chSemiColon)) 01961 fScanner->emitError(XMLErrs::UnterminatedEntityRef, bbName.getRawBuffer()); 01962 01963 // Make sure it was all in one entity reader 01964 if (curReader != fReaderMgr->getCurrentReaderNum()) 01965 fScanner->emitError(XMLErrs::PartialMarkupInEntity); 01966 01967 // Look it up the name the general entity pool 01968 XMLEntityDecl* decl = fDTDGrammar->getEntityDecl(bbName.getRawBuffer()); 01969 01970 // If it does not exist, then obviously an error 01971 if (!decl) 01972 { 01973 // XML 1.0 Section 4.1 01974 if (fScanner->getStandalone() || fScanner->getHasNoDTD()) { 01975 fScanner->emitError(XMLErrs::EntityNotFound, bbName.getRawBuffer()); 01976 } 01977 else { 01978 if (fScanner->getValidationScheme() == XMLScanner::Val_Always) 01979 fScanner->getValidator()->emitError(XMLValid::VC_EntityNotFound, bbName.getRawBuffer()); 01980 } 01981 01982 return EntityExp_Failed; 01983 } 01984 01985 01986 // 01987 // XML 1.0 Section 4.1 01988 // If we are a standalone document, then it has to have been declared 01989 // in the internal subset. 01990 // 01991 if (fScanner->getStandalone() && !decl->getDeclaredInIntSubset()) 01992 fScanner->emitError(XMLErrs::IllegalRefInStandalone, bbName.getRawBuffer()); 01993 01994 // 01995 // If its a special char reference, then its escaped and we can return 01996 // it directly. 01997 // 01998 if (decl->getIsSpecialChar()) 01999 { 02000 firstCh = decl->getValue()[0]; 02001 escaped = true; 02002 return EntityExp_Returned; 02003 } 02004 02005 if (decl->isExternal()) 02006 { 02007 // If its unparsed, then its not valid here 02008 // XML 1.0 Section 4.4.4 the appearance of a reference to an unparsed entity is forbidden. 02009 if (decl->isUnparsed()) 02010 { 02011 fScanner->emitError(XMLErrs::NoUnparsedEntityRefs, bbName.getRawBuffer()); 02012 return EntityExp_Failed; 02013 } 02014 02015 // We are in an attribute value, so not valid. 02016 // XML 1.0 Section 4.4.4 a reference to an external entity in an attribute value is forbidden. 02017 fScanner->emitError(XMLErrs::NoExtRefsInAttValue); 02018 02019 // And now create a reader to read this entity 02020 InputSource* srcUsed; 02021 XMLReader* reader = fReaderMgr->createReader 02022 ( 02023 decl->getBaseURI() 02024 , decl->getSystemId() 02025 , decl->getPublicId() 02026 , false 02027 , XMLReader::RefFrom_NonLiteral 02028 , XMLReader::Type_General 02029 , XMLReader::Source_External 02030 , srcUsed 02031 , fScanner->getCalculateSrcOfs() 02032 , fScanner->getLowWaterMark() 02033 , fScanner->getDisableDefaultEntityResolution() 02034 ); 02035 02036 // Put a janitor on the source so it gets cleaned up on exit 02037 Janitor<InputSource> janSrc(srcUsed); 02038 02039 // 02040 // If the creation failed then throw an exception 02041 // 02042 if (!reader) 02043 ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Gen_CouldNotOpenExtEntity, srcUsed ? srcUsed->getSystemId() : decl->getSystemId(), fMemoryManager); 02044 02045 // 02046 // Push the reader. If its a recursive expansion, then emit an error 02047 // and return an failure. 02048 // 02049 if (!fReaderMgr->pushReader(reader, decl)) 02050 { 02051 fScanner->emitError(XMLErrs::RecursiveEntity, decl->getName()); 02052 return EntityExp_Failed; 02053 } 02054 02055 // If it starts with the XML string, then parse a text decl 02056 if (fScanner->checkXMLDecl(true)) 02057 scanTextDecl(); 02058 } 02059 else 02060 { 02061 // 02062 // Create a reader over a memory stream over the entity value 02063 // We force it to assume UTF-16 by passing in an encoding 02064 // string. This way it won't both trying to predecode the 02065 // first line, looking for an XML/TextDecl. 02066 // 02067 XMLReader* valueReader = fReaderMgr->createIntEntReader 02068 ( 02069 decl->getName() 02070 , XMLReader::RefFrom_NonLiteral 02071 , XMLReader::Type_General 02072 , decl->getValue() 02073 , decl->getValueLen() 02074 , false 02075 ); 02076 02077 // 02078 // Trt to push the entity reader onto the reader manager stack, 02079 // where it will become the subsequent input. If it fails, that 02080 // means the entity is recursive, so issue an error. The reader 02081 // will have just been discarded, but we just keep going. 02082 // 02083 if (!fReaderMgr->pushReader(valueReader, decl)) 02084 fScanner->emitError(XMLErrs::RecursiveEntity, decl->getName()); 02085 } 02086 02087 return EntityExp_Pushed; 02088 } 02089 02090 02091 // 02092 // This method will scan a quoted literal of an entity value. It has to 02093 // deal with replacement of PE references; however, since this is a DTD 02094 // scanner, all such entity literals are in entity decls and therefore 02095 // general entities are not expanded. 02096 // 02097 bool DTDScanner::scanEntityLiteral(XMLBuffer& toFill) 02098 { 02099 toFill.reset(); 02100 02101 // Get the next char which must be a single or double quote 02102 XMLCh quoteCh; 02103 if (!fReaderMgr->skipIfQuote(quoteCh)) 02104 return false; 02105 02106 // Get a buffer for pulling in entity names when we see GE refs 02107 XMLBufBid bbName(fBufMgr); 02108 XMLBuffer& nameBuf = bbName.getBuffer(); 02109 02110 // Remember the current reader 02111 const XMLSize_t orgReader = fReaderMgr->getCurrentReaderNum(); 02112 02113 // 02114 // Loop until we see the ending quote character, handling any references 02115 // in the process. 02116 // 02117 XMLCh nextCh; 02118 XMLCh secondCh = 0; 02119 bool gotLeadingSurrogate = false; 02120 while (true) 02121 { 02122 nextCh = fReaderMgr->getNextChar(); 02123 02124 // 02125 // Watch specifically for EOF and issue a more meaningful error 02126 // if that occurs (since an unterminated quoted char can cause 02127 // this easily.) 02128 // 02129 if (!nextCh) 02130 { 02131 fScanner->emitError(XMLErrs::UnterminatedEntityLiteral); 02132 ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager); 02133 } 02134 02135 // 02136 // Break out on our terminating quote char when we are back in the 02137 // same reader. Otherwise, we might trigger on a nested quote char 02138 // in an expanded entity. 02139 // 02140 if ((nextCh == quoteCh) 02141 && (fReaderMgr->getCurrentReaderNum() == orgReader)) 02142 { 02143 break; 02144 } 02145 02146 if (nextCh == chPercent) 02147 { 02148 // 02149 // Put the PE's value on the reader stack and then jump back 02150 // to the top to start processing it. The parameter indicates 02151 // that it should not scan the reference's content as an external 02152 // subset. 02153 // 02154 expandPERef(false, true, true); 02155 continue; 02156 } 02157 02158 // 02159 // Ok, now that all the other special stuff is checked, we can 02160 // look for a general entity. In here, we cannot have a naked & 02161 // and will only expand numerical char refs or the intrinsic char 02162 // refs. Others will be left alone. 02163 // 02164 if (nextCh == chAmpersand) 02165 { 02166 // 02167 // Here, we only expand numeric char refs, but not any general 02168 // entities. However, the stupid XML spec requires that we check 02169 // and make sure it does refer to a general entity if its not 02170 // a char ref (i.e. no naked '&' chars.) 02171 // 02172 if (fReaderMgr->skippedChar(chPound)) 02173 { 02174 // If it failed, then just jump back to the top and try to pick up 02175 if (!scanCharRef(nextCh, secondCh)) 02176 { 02177 gotLeadingSurrogate = false; 02178 continue; 02179 } 02180 } 02181 else 02182 { 02183 if (!fReaderMgr->getName(nameBuf)) 02184 { 02185 fScanner->emitError(XMLErrs::ExpectedEntityRefName); 02186 } 02187 else 02188 { 02189 // 02190 // Since we are not expanding any of this, we have to 02191 // put the amp and name into the target buffer as data. 02192 // 02193 toFill.append(chAmpersand); 02194 toFill.append(nameBuf.getRawBuffer()); 02195 02196 // Make sure we skipped a trailing semicolon 02197 if (!fReaderMgr->skippedChar(chSemiColon)) 02198 { 02199 fScanner->emitError 02200 ( 02201 XMLErrs::UnterminatedEntityRef 02202 , nameBuf.getRawBuffer() 02203 ); 02204 } 02205 02206 // And make the new character the semicolon 02207 nextCh = chSemiColon; 02208 } 02209 02210 // Either way here we reset the surrogate flag 02211 gotLeadingSurrogate = false; 02212 } 02213 } 02214 else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF)) 02215 { 02216 if (gotLeadingSurrogate) 02217 fScanner->emitError(XMLErrs::Expected2ndSurrogateChar); 02218 else 02219 gotLeadingSurrogate = true; 02220 } 02221 else 02222 { 02223 if (gotLeadingSurrogate) 02224 { 02225 if ((nextCh < 0xDC00) || (nextCh > 0xDFFF)) 02226 fScanner->emitError(XMLErrs::Expected2ndSurrogateChar); 02227 } 02228 else if (!fReaderMgr->getCurrentReader()->isXMLChar(nextCh)) 02229 { 02230 XMLCh tmpBuf[9]; 02231 XMLString::binToText 02232 ( 02233 nextCh 02234 , tmpBuf 02235 , 8 02236 , 16 02237 , fMemoryManager 02238 ); 02239 fScanner->emitError(XMLErrs::InvalidCharacter, tmpBuf); 02240 fReaderMgr->skipPastChar(quoteCh); 02241 return false; 02242 } 02243 gotLeadingSurrogate = false; 02244 } 02245 02246 // Looks ok, so add it to the literal 02247 toFill.append(nextCh); 02248 02249 if (secondCh) 02250 { 02251 toFill.append(secondCh); 02252 secondCh=0; 02253 } 02254 } 02255 02256 // 02257 // If we got here and did not get back to the original reader level, 02258 // then we propogated some entity out of the literal, so issue an 02259 // error, but don't fail. 02260 // 02261 if (fReaderMgr->getCurrentReaderNum() != orgReader && fScanner->getValidationScheme() == XMLScanner::Val_Always) 02262 fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE); 02263 02264 return true; 02265 } 02266 02267 02268 // 02269 // This method is called after the entity name has been scanned, and any 02270 // PE referenced following the name is handled. The passed decl will be 02271 // filled in with the info scanned. 02272 // 02273 bool DTDScanner::scanEntityDef(DTDEntityDecl& decl, const bool isPEDecl) 02274 { 02275 // Its got to be an entity literal 02276 if (fReaderMgr->lookingAtChar(chSingleQuote) 02277 || fReaderMgr->lookingAtChar(chDoubleQuote)) 02278 { 02279 // Get a buffer for the literal 02280 XMLBufBid bbValue(fBufMgr); 02281 02282 if (!scanEntityLiteral(bbValue.getBuffer())) 02283 return false; 02284 02285 // Set it on the entity decl 02286 decl.setValue(bbValue.getRawBuffer()); 02287 return true; 02288 } 02289 02290 // 02291 // Its got to be an external entity, so there must be an external id. 02292 // Get buffers for them and scan an external id into them. 02293 // 02294 XMLBufBid bbPubId(fBufMgr); 02295 XMLBufBid bbSysId(fBufMgr); 02296 if (!scanId(bbPubId.getBuffer(), bbSysId.getBuffer(), IDType_External)) 02297 return false; 02298 02299 decl.setIsExternal(true); 02300 ReaderMgr::LastExtEntityInfo lastInfo; 02301 fReaderMgr->getLastExtEntityInfo(lastInfo); 02302 02303 // Fill in the id fields of the decl with the info we got 02304 const XMLCh* publicId = bbPubId.getRawBuffer(); 02305 const XMLCh* systemId = bbSysId.getRawBuffer(); 02306 decl.setPublicId((publicId && *publicId) ? publicId : 0); 02307 decl.setSystemId((systemId && *systemId) ? systemId : 0); 02308 decl.setBaseURI((lastInfo.systemId && *lastInfo.systemId) ? lastInfo.systemId : 0); 02309 02310 // If its a PE decl, we are done 02311 bool gotSpaces = checkForPERef(false, true); 02312 if (isPEDecl) 02313 { 02314 // 02315 // Check for a common error here. NDATA is not allowed for PEs 02316 // so check for the NDATA string. If found give a nice meaningful 02317 // error and continue parsing to eat the NDATA text. 02318 // 02319 if (gotSpaces) 02320 { 02321 if (fReaderMgr->skippedString(XMLUni::fgNDATAString)) 02322 fScanner->emitError(XMLErrs::NDATANotValidForPE); 02323 } 02324 else 02325 { 02326 return true; 02327 } 02328 } 02329 02330 // If looking at close angle now, we are done 02331 if (fReaderMgr->lookingAtChar(chCloseAngle)) 02332 return true; 02333 02334 // Else we had to have seem the whitespace 02335 if (!gotSpaces) 02336 fScanner->emitError(XMLErrs::ExpectedWhitespace); 02337 02338 // We now have to see a notation data string 02339 if (!fReaderMgr->skippedString(XMLUni::fgNDATAString)) 02340 fScanner->emitError(XMLErrs::ExpectedNDATA); 02341 02342 // Space is required here, but try to go on if not 02343 if (!checkForPERef(false, true)) 02344 fScanner->emitError(XMLErrs::ExpectedWhitespace); 02345 02346 // Get a name 02347 XMLBufBid bbName(fBufMgr); 02348 if (!fReaderMgr->getName(bbName.getBuffer())) 02349 { 02350 fScanner->emitError(XMLErrs::ExpectedNotationName); 02351 return false; 02352 } 02353 02354 // Set the decl's notation name 02355 decl.setNotationName(bbName.getRawBuffer()); 02356 02357 return true; 02358 } 02359 02360 02361 // 02362 // This method is called after an attribute decl name or a notation decl has 02363 // been scanned and then an opening parenthesis was see, indicating the list 02364 // of values. It scans the enumeration values and creates a single string 02365 // which has a single space between each value. 02366 // 02367 // The terminating close paren ends this scan. 02368 // 02369 bool DTDScanner::scanEnumeration( const DTDAttDef& attDef 02370 , XMLBuffer& toFill 02371 , const bool notation) 02372 { 02373 // Reset the passed buffer 02374 toFill.reset(); 02375 02376 // Check for PE ref but don't require space 02377 checkForPERef(false, true); 02378 02379 // If this is a notation, we need an opening paren 02380 if (notation) 02381 { 02382 if (!fReaderMgr->skippedChar(chOpenParen)) 02383 fScanner->emitError(XMLErrs::ExpectedOpenParen); 02384 } 02385 02386 // We need a local buffer to use as well 02387 XMLBufBid bbTmp(fBufMgr); 02388 02389 while (true) 02390 { 02391 // Space is allowed here for either type so check for PE ref 02392 checkForPERef(false, true); 02393 02394 // And then get either a name or a name token 02395 bool success; 02396 if (notation) 02397 success = fReaderMgr->getName(bbTmp.getBuffer()); 02398 else 02399 success = fReaderMgr->getNameToken(bbTmp.getBuffer()); 02400 02401 if (!success) 02402 { 02403 fScanner->emitError 02404 ( 02405 XMLErrs::ExpectedEnumValue 02406 , attDef.getFullName() 02407 ); 02408 return false; 02409 } 02410 02411 // Append this value to the target value 02412 toFill.append(bbTmp.getRawBuffer(), bbTmp.getLen()); 02413 02414 // Space is allowed here for either type so check for PE ref 02415 checkForPERef(false, true); 02416 02417 // Check for the terminating paren 02418 if (fReaderMgr->skippedChar(chCloseParen)) 02419 break; 02420 02421 // And append a space separator 02422 toFill.append(chSpace); 02423 02424 // Check for the pipe character separator 02425 if (!fReaderMgr->skippedChar(chPipe)) 02426 { 02427 fScanner->emitError(XMLErrs::ExpectedEnumSepOrParen); 02428 return false; 02429 } 02430 } 02431 return true; 02432 } 02433 02434 02435 bool DTDScanner::scanEq() 02436 { 02437 fReaderMgr->skipPastSpaces(); 02438 if (fReaderMgr->skippedChar(chEqual)) 02439 { 02440 fReaderMgr->skipPastSpaces(); 02441 return true; 02442 } 02443 return false; 02444 } 02445 02446 02447 // 02448 // This method is called when an external entity reference is seen in the 02449 // DTD or an external DTD subset is encountered, and their contents pushed 02450 // onto the reader stack. This method will scan that contents. 02451 // 02452 void DTDScanner::scanExtSubsetDecl(const bool inIncludeSect, const bool isDTD) 02453 { 02454 // Indicate we are in the external subset now 02455 FlagJanitor<bool> janContentFlag(&fInternalSubset, false); 02456 02457 02458 bool bAcceptDecl = !inIncludeSect; 02459 02460 // Get a buffer for whitespace 02461 XMLBufBid bbSpace(fBufMgr); 02462 02463 // 02464 // If we have a doc type handler and we are not being called recursively 02465 // to handle an include section, tell it the ext subset starts 02466 // 02467 if (fDocTypeHandler && isDTD && !inIncludeSect) 02468 fDocTypeHandler->startExtSubset(); 02469 02470 // 02471 // We have to play a trick here if the current entity we are parsing 02472 // is a PE. Because the spooling code will put out a whitespace before 02473 // and after an expanded PE if its being scanned outside the context of 02474 // a literal entity, this will confuse this external subset code. 02475 // 02476 // So, we see if that is what is happening and, if so, eat the single 02477 // space, a check for the <?xml string. If we find it, we parse that 02478 // markup right now and put the space back. 02479 // 02480 if (fReaderMgr->isScanningPERefOutOfLiteral()) 02481 { 02482 if (fReaderMgr->skippedSpace()) 02483 { 02484 if (fScanner->checkXMLDecl(true)) 02485 { 02486 scanTextDecl(); 02487 bAcceptDecl = false; 02488 02489 // <TBD> Figure out how to do this 02490 // fReaderMgr->unGet(chSpace); 02491 } 02492 } 02493 } 02494 02495 // Get the current reader number 02496 const XMLSize_t orgReader = fReaderMgr->getCurrentReaderNum(); 02497 02498 // 02499 // Loop until we hit the end of the external subset entity. Note that 02500 // we use a double loop here in order to avoid the overhead of doing 02501 // the exception setup/teardown work on every loop. 02502 // 02503 bool inMarkup = false; 02504 bool inCharData = false; 02505 while (true) 02506 { 02507 bool bDoBreak=false; // workaround for Borland bug with 'break' in 'catch' 02508 try 02509 { 02510 while (true) 02511 { 02512 const XMLCh nextCh = fReaderMgr->peekNextChar(); 02513 02514 if (!nextCh) 02515 { 02516 return; // nothing left 02517 } 02518 else if (nextCh == chOpenAngle) 02519 { 02520 // Get the reader we started this on 02521 // XML 1.0 P28a Well-formedness constraint: PE Between Declarations 02522 const XMLSize_t orgReader = fReaderMgr->getCurrentReaderNum(); 02523 bool wasInPE = (fReaderMgr->getCurrentReader()->getType() == XMLReader::Type_PE); 02524 02525 // 02526 // Now scan the markup. Set the flag so that we will know that 02527 // we were in markup if an end of entity exception occurs. 02528 // 02529 fReaderMgr->getNextChar(); 02530 inMarkup = true; 02531 scanMarkupDecl(bAcceptDecl); 02532 inMarkup = false; 02533 02534 // 02535 // And see if we got back to the same level. If not, then its 02536 // a partial markup error. 02537 // 02538 if (fReaderMgr->getCurrentReaderNum() != orgReader){ 02539 if (wasInPE) 02540 fScanner->emitError(XMLErrs::PEBetweenDecl); 02541 else if (fScanner->getValidationScheme() == XMLScanner::Val_Always) 02542 fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE); 02543 } 02544 02545 } 02546 else if (fReaderMgr->getCurrentReader()->isWhitespace(nextCh)) 02547 { 02548 // 02549 // If we have a doc type handler, and advanced callbacks are 02550 // enabled, then gather up whitespace and call back. Otherwise 02551 // just skip whitespaces. 02552 // 02553 if (fDocTypeHandler) 02554 { 02555 inCharData = true; 02556 fReaderMgr->getSpaces(bbSpace.getBuffer()); 02557 inCharData = false; 02558 02559 fDocTypeHandler->doctypeWhitespace 02560 ( 02561 bbSpace.getRawBuffer() 02562 , bbSpace.getLen() 02563 ); 02564 } 02565 else 02566 { 02567 // 02568 // If we hit an end of entity in the middle of white 02569 // space, that's fine. We'll just come back in here 02570 // again on the next round and skip some more. 02571 // 02572 fReaderMgr->skipPastSpaces(); 02573 } 02574 } 02575 else if (nextCh == chPercent) 02576 { 02577 // 02578 // Expand (and scan if external) the reference value. Tell 02579 // it to throw an end of entity exception at the end of the 02580 // entity. 02581 // 02582 fReaderMgr->getNextChar(); 02583 expandPERef(true, false, false, true); 02584 } 02585 else if (inIncludeSect && (nextCh == chCloseSquare)) 02586 { 02587 // 02588 // Its the end of a conditional include section. So scan it and 02589 // decrement the include depth counter. 02590 // 02591 fReaderMgr->getNextChar(); 02592 if (!fReaderMgr->skippedChar(chCloseSquare)) 02593 { 02594 fScanner->emitError(XMLErrs::ExpectedEndOfConditional); 02595 fReaderMgr->skipPastChar(chCloseAngle); 02596 } 02597 else if (!fReaderMgr->skippedChar(chCloseAngle)) 02598 { 02599 fScanner->emitError(XMLErrs::ExpectedEndOfConditional); 02600 fReaderMgr->skipPastChar(chCloseAngle); 02601 } 02602 return; 02603 } 02604 else 02605 { 02606 fReaderMgr->getNextChar(); 02607 if (!fReaderMgr->getCurrentReader()->isXMLChar(nextCh)) 02608 { 02609 XMLCh tmpBuf[9]; 02610 XMLString::binToText 02611 ( 02612 nextCh 02613 , tmpBuf 02614 , 8 02615 , 16 02616 , fMemoryManager 02617 ); 02618 fScanner->emitError(XMLErrs::InvalidCharacter, tmpBuf); 02619 } 02620 else 02621 { 02622 fScanner->emitError(XMLErrs::InvalidDocumentStructure); 02623 } 02624 02625 // Try to get realigned 02626 static const XMLCh toSkip[] = 02627 { 02628 chPercent, chCloseSquare, chOpenAngle, chNull 02629 }; 02630 fReaderMgr->skipUntilInOrWS(toSkip); 02631 } 02632 bAcceptDecl = false; 02633 } 02634 } 02635 catch(const EndOfEntityException& toCatch) 02636 { 02637 // 02638 // If the external entity ended while we were in markup, then that's 02639 // a partial markup error. 02640 // 02641 if (inMarkup) 02642 { 02643 fScanner->emitError(XMLErrs::PartialMarkupInEntity); 02644 inMarkup = false; 02645 } 02646 02647 // If we were in char data, then send what we got 02648 if (inCharData) 02649 { 02650 // Send what we got, then rethrow 02651 if (fDocTypeHandler) 02652 { 02653 fDocTypeHandler->doctypeWhitespace 02654 ( 02655 bbSpace.getRawBuffer() 02656 , bbSpace.getLen() 02657 ); 02658 } 02659 inCharData = false; 02660 } 02661 02662 // 02663 // If the entity that just ended was the entity that we started 02664 // on, then this is the end of the external subset. 02665 // 02666 if (orgReader == toCatch.getReaderNum()) 02667 bDoBreak=true; 02668 } 02669 if(bDoBreak) 02670 break; 02671 } 02672 02673 // If we have a doc type handler, tell it the ext subset ends 02674 if (fDocTypeHandler && isDTD && !inIncludeSect) 02675 fDocTypeHandler->endExtSubset(); 02676 } 02677 02678 02679 // 02680 // This method will scan for an id, either public or external. 02681 // 02682 // 02683 // [75] ExternalID ::= 'SYSTEM' S SystemLiteral 02684 // | 'PUBLIC' S PubidLiteral S SystemLiteral 02685 // [83] PublicID ::= 'PUBLIC' S PubidLiteral 02686 // 02687 bool DTDScanner::scanId( XMLBuffer& pubIdToFill 02688 , XMLBuffer& sysIdToFill 02689 , const IDTypes whatKind) 02690 { 02691 // Clean out both return buffers 02692 pubIdToFill.reset(); 02693 sysIdToFill.reset(); 02694 02695 // 02696 // Check first for the system id first. If we find it, and system id 02697 // is one of the legal values, then lets try to scan it. 02698 // 02699 // 'SYSTEM' S SystemLiteral 02700 if (fReaderMgr->skippedString(XMLUni::fgSysIDString)) 02701 { 02702 // If they were looking for a public id, then we failed 02703 if (whatKind == IDType_Public) 02704 { 02705 fScanner->emitError(XMLErrs::ExpectedPublicId); 02706 return false; 02707 } 02708 02709 // We must skip spaces 02710 bool skippedSomething; 02711 fReaderMgr->skipPastSpaces(skippedSomething); 02712 if (!skippedSomething) 02713 { 02714 fScanner->emitError(XMLErrs::ExpectedWhitespace); 02715 return false; 02716 } 02717 02718 // Get the system literal value 02719 return scanSystemLiteral(sysIdToFill); 02720 } 02721 02722 // Now scan for public id 02723 // 'PUBLIC' S PubidLiteral S SystemLiteral 02724 // or 02725 // 'PUBLIC' S PubidLiteral 02726 02727 // If we don't have any public id string => Error 02728 if (!fReaderMgr->skippedString(XMLUni::fgPubIDString)) { 02729 fScanner->emitError(XMLErrs::ExpectedSystemOrPublicId); 02730 return false; 02731 } 02732 02733 // 02734 // So following this we must have whitespace, a public literal, whitespace, 02735 // and a system literal. 02736 // 02737 bool skippedSomething; 02738 fReaderMgr->skipPastSpaces(skippedSomething); 02739 if (!skippedSomething) 02740 { 02741 fScanner->emitError(XMLErrs::ExpectedWhitespace); 02742 02743 // 02744 // Just in case, if they just forgot the whitespace but the next char 02745 // is a single or double quote, then keep going. 02746 // 02747 const XMLCh chPeek = fReaderMgr->peekNextChar(); 02748 if ((chPeek != chDoubleQuote) && (chPeek != chSingleQuote)) 02749 return false; 02750 } 02751 02752 if (!scanPublicLiteral(pubIdToFill)) 02753 return false; 02754 02755 // If they wanted a public id, then this is all 02756 if (whatKind == IDType_Public) 02757 return true; 02758 02759 // check if there is any space follows 02760 bool hasSpace; 02761 fReaderMgr->skipPastSpaces(hasSpace); 02762 02763 // 02764 // In order to recover best here we need to see if 02765 // the next thing is a quote or not 02766 // 02767 const XMLCh chPeek = fReaderMgr->peekNextChar(); 02768 const bool bIsQuote = ((chPeek == chDoubleQuote) 02769 || (chPeek == chSingleQuote)); 02770 02771 if (!hasSpace) 02772 { 02773 if (whatKind == IDType_External) 02774 { 02775 // 02776 // If its an external Id, then we need to see the system id. 02777 // So, emit the error. But, if the next char is a quote, don't 02778 // give up since its probably going to work. The user just 02779 // missed the separating space. Otherwise, fail. 02780 // 02781 fScanner->emitError(XMLErrs::ExpectedWhitespace); 02782 if (!bIsQuote) 02783 return false; 02784 } 02785 else 02786 { 02787 // 02788 // We can legally return here. But, if the next char is a quote, 02789 // then that's probably not what was desired, since its probably 02790 // just that space was forgotten and there really is a system 02791 // id to follow. 02792 // 02793 // So treat it like missing whitespace if so and keep going. 02794 // Else, just return success. 02795 // 02796 if (bIsQuote) 02797 fScanner->emitError(XMLErrs::ExpectedWhitespace); 02798 else 02799 return true; 02800 } 02801 } 02802 02803 if (bIsQuote) { 02804 // there is a quote coming, scan the system literal 02805 if (!scanSystemLiteral(sysIdToFill)) 02806 return false; 02807 } 02808 else { 02809 // no quote, if expecting exteral id, this is an error 02810 if (whatKind == IDType_External) 02811 fScanner->emitError(XMLErrs::ExpectedQuotedString); 02812 } 02813 02814 return true; 02815 } 02816 02817 02818 // 02819 // This method will scan the contents of an ignored section. It assumes that 02820 // we already are in the body, i.e. we've seen <![IGNORE[ at this point. So 02821 // we have to just scan until we see a matching ]]> closing markup. 02822 // 02823 void DTDScanner::scanIgnoredSection() 02824 { 02825 // 02826 // Depth starts at one because we are already in one section and want 02827 // to parse until we hit its end. 02828 // 02829 unsigned long depth = 1; 02830 bool gotLeadingSurrogate = false; 02831 while (true) 02832 { 02833 const XMLCh nextCh = fReaderMgr->getNextChar(); 02834 02835 if (!nextCh) 02836 ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager); 02837 02838 if (nextCh == chOpenAngle) 02839 { 02840 if (fReaderMgr->skippedChar(chBang) 02841 && fReaderMgr->skippedChar(chOpenSquare)) 02842 { 02843 depth++; 02844 } 02845 } 02846 else if (nextCh == chCloseSquare) 02847 { 02848 if (fReaderMgr->skippedChar(chCloseSquare)) 02849 { 02850 while (fReaderMgr->skippedChar(chCloseSquare)) 02851 { 02852 // Do nothing, just skip them 02853 } 02854 02855 if (fReaderMgr->skippedChar(chCloseAngle)) 02856 { 02857 depth--; 02858 if (!depth) 02859 break; 02860 } 02861 } 02862 } 02863 // Deal with surrogate pairs 02864 else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF)) 02865 { 02866 // Its a leading surrogate. If we already got one, then 02867 // issue an error, else set leading flag to make sure that 02868 // we look for a trailing next time. 02869 if (gotLeadingSurrogate) 02870 fScanner->emitError(XMLErrs::Expected2ndSurrogateChar); 02871 else 02872 gotLeadingSurrogate = true; 02873 } 02874 else 02875 { 02876 // If its a trailing surrogate, make sure that we are 02877 // prepared for that. Else, its just a regular char so make 02878 // sure that we were not expected a trailing surrogate. 02879 if ((nextCh >= 0xDC00) && (nextCh <= 0xDFFF)) 02880 { 02881 // Its trailing, so make sure we were expecting it 02882 if (!gotLeadingSurrogate) 02883 fScanner->emitError(XMLErrs::Unexpected2ndSurrogateChar); 02884 } 02885 else 02886 { 02887 // Its just a char, so make sure we were not expecting a 02888 // trailing surrogate. 02889 if (gotLeadingSurrogate) 02890 fScanner->emitError(XMLErrs::Expected2ndSurrogateChar); 02891 02892 // Its got to at least be a valid XML character 02893 else if (!fReaderMgr->getCurrentReader()->isXMLChar(nextCh)) 02894 { 02895 XMLCh tmpBuf[9]; 02896 XMLString::binToText 02897 ( 02898 nextCh 02899 , tmpBuf 02900 , 8 02901 , 16 02902 , fMemoryManager 02903 ); 02904 fScanner->emitError(XMLErrs::InvalidCharacter, tmpBuf); 02905 } 02906 } 02907 gotLeadingSurrogate = false; 02908 } 02909 } 02910 } 02911 02912 02913 // 02914 // This method scans the entire internal subset. All we can have here is 02915 // decl markup, and PE references. The expanded PE references must contain 02916 // whole markup, so we don't have to worry about their content at this 02917 // level. We just scan them, expand them, push them, and parse their content 02918 // right there, via the expandERef() method. 02919 // 02920 bool DTDScanner::scanInternalSubset() 02921 { 02922 // Indicate we are in the internal subset now 02923 FlagJanitor<bool> janContentFlag(&fInternalSubset, true); 02924 02925 // If we have a doc type handler, tell it the internal subset starts 02926 if (fDocTypeHandler) 02927 fDocTypeHandler->startIntSubset(); 02928 02929 // Get a buffer for whitespace 02930 XMLBufBid bbSpace(fBufMgr); 02931 02932 bool noErrors = true; 02933 while (true) 02934 { 02935 const XMLCh nextCh = fReaderMgr->peekNextChar(); 02936 02937 // 02938 // If we get an end of file marker, just unget it and return a 02939 // failure status. The caller will then see the end of file and 02940 // faill out correctly. 02941 // 02942 if (!nextCh) 02943 return false; 02944 02945 // Watch for the end of internal subset marker 02946 if (nextCh == chCloseSquare) 02947 { 02948 fReaderMgr->getNextChar(); 02949 break; 02950 } 02951 02952 if (nextCh == chPercent) 02953 { 02954 // 02955 // Expand (and scan if external) the reference value. Tell 02956 // it to set the reader to cause an end of entity exception 02957 // when this reader dies, which is what the scanExtSubset 02958 // method wants (who is called to scan this.) 02959 // 02960 fReaderMgr->getNextChar(); 02961 expandPERef(true, false, false, true); 02962 } 02963 else if (nextCh == chOpenAngle) 02964 { 02965 // Remember this reader before we start the scan, for checking 02966 // XML 1.0 P28a Well-formedness constraint: PE Between Declarations 02967 const XMLSize_t orgReader = fReaderMgr->getCurrentReaderNum(); 02968 bool wasInPE = (fReaderMgr->getCurrentReader()->getType() == XMLReader::Type_PE); 02969 02970 // And scan this markup 02971 fReaderMgr->getNextChar(); 02972 scanMarkupDecl(false); 02973 02974 // If we did not get back to entry level, then partial markup 02975 if (fReaderMgr->getCurrentReaderNum() != orgReader) { 02976 if (wasInPE) 02977 fScanner->emitError(XMLErrs::PEBetweenDecl); 02978 else if (fScanner->getValidationScheme() == XMLScanner::Val_Always) 02979 fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE); 02980 } 02981 } 02982 else if (fReaderMgr->getCurrentReader()->isWhitespace(nextCh)) 02983 { 02984 // 02985 // IF we are doing advanced callbacks and have a doc type 02986 // handler, then get the whitespace and call the doc type 02987 // handler with it. Otherwise, just skip whitespace. 02988 // 02989 if (fDocTypeHandler) 02990 { 02991 fReaderMgr->getSpaces(bbSpace.getBuffer()); 02992 fDocTypeHandler->doctypeWhitespace 02993 ( 02994 bbSpace.getRawBuffer() 02995 , bbSpace.getLen() 02996 ); 02997 } 02998 else 02999 { 03000 fReaderMgr->skipPastSpaces(); 03001 } 03002 } 03003 else 03004 { 03005 // Not valid, so emit an error 03006 XMLCh tmpBuf[9]; 03007 XMLString::binToText 03008 ( 03009 fReaderMgr->getNextChar() 03010 , tmpBuf 03011 , 8 03012 , 16 03013 , fMemoryManager 03014 ); 03015 fScanner->emitError 03016 ( 03017 XMLErrs::InvalidCharacterInIntSubset 03018 , tmpBuf 03019 ); 03020 03021 // 03022 // If an '>', then probably an abnormally terminated 03023 // internal subset so just return. 03024 // 03025 if (nextCh == chCloseAngle) 03026 { 03027 noErrors = false; 03028 break; 03029 } 03030 03031 // 03032 // Otherwise, try to sync back up by scanning forward for 03033 // a reasonable start character. 03034 // 03035 static const XMLCh toSkip[] = 03036 { 03037 chPercent, chCloseSquare, chOpenAngle, chNull 03038 }; 03039 fReaderMgr->skipUntilInOrWS(toSkip); 03040 } 03041 } 03042 03043 // If we have a doc type handler, tell it the internal subset ends 03044 if (fDocTypeHandler) 03045 fDocTypeHandler->endIntSubset(); 03046 03047 return noErrors; 03048 } 03049 03050 03051 // 03052 // This method is called once we see a < in the input of an int/ext subset, 03053 // which indicates the start of some sort of markup. 03054 // 03055 void DTDScanner::scanMarkupDecl(const bool parseTextDecl) 03056 { 03057 // 03058 // We only have two valid first characters here. One is a ! which opens 03059 // some markup decl. The other is a ?, which could begin either a PI 03060 // or a text decl. If parseTextDecl is false, we cannot accept a text 03061 // decl. 03062 // 03063 const XMLCh nextCh = fReaderMgr->getNextChar(); 03064 03065 if (nextCh == chBang) 03066 { 03067 if (fReaderMgr->skippedChar(chDash)) 03068 { 03069 if (fReaderMgr->skippedChar(chDash)) 03070 { 03071 scanComment(); 03072 } 03073 else 03074 { 03075 fScanner->emitError(XMLErrs::CommentsMustStartWith); 03076 fReaderMgr->skipPastChar(chCloseAngle); 03077 } 03078 } 03079 else if (fReaderMgr->skippedChar(chOpenSquare)) 03080 { 03081 // 03082 // Its a conditional section. This is only valid in the external 03083 // subset, so issue an error if we aren't there. 03084 // 03085 if (fInternalSubset) 03086 { 03087 fScanner->emitError(XMLErrs::ConditionalSectInIntSubset); 03088 fReaderMgr->skipPastChar(chCloseAngle); 03089 return; 03090 } 03091 03092 // A PE ref can happen here, but space is not required 03093 checkForPERef(false, true); 03094 03095 if (fReaderMgr->skippedString(XMLUni::fgIncludeString)) 03096 { 03097 checkForPERef(false, true); 03098 03099 // Check for the following open square bracket 03100 if (!fReaderMgr->skippedChar(chOpenSquare)) 03101 fScanner->emitError(XMLErrs::ExpectedINCLUDEBracket); 03102 03103 // Get the reader we started this on 03104 const XMLSize_t orgReader = fReaderMgr->getCurrentReaderNum(); 03105 03106 checkForPERef(false, true); 03107 03108 // 03109 // Recurse back to the ext subset call again, telling it its 03110 // in an include section. 03111 // 03112 scanExtSubsetDecl(true, false); 03113 03114 // 03115 // And see if we got back to the same level. If not, then its 03116 // a partial markup error. 03117 // 03118 if (fReaderMgr->getCurrentReaderNum() != orgReader && fScanner->getValidationScheme() == XMLScanner::Val_Always) 03119 fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE); 03120 03121 } 03122 else if (fReaderMgr->skippedString(XMLUni::fgIgnoreString)) 03123 { 03124 checkForPERef(false, true); 03125 03126 // Check for the following open square bracket 03127 if (!fReaderMgr->skippedChar(chOpenSquare)) 03128 fScanner->emitError(XMLErrs::ExpectedINCLUDEBracket); 03129 03130 // Get the reader we started this on 03131 const XMLSize_t orgReader = fReaderMgr->getCurrentReaderNum(); 03132 03133 // And scan over the ignored part 03134 scanIgnoredSection(); 03135 03136 // 03137 // And see if we got back to the same level. If not, then its 03138 // a partial markup error. 03139 // 03140 if (fReaderMgr->getCurrentReaderNum() != orgReader && fScanner->getValidationScheme() == XMLScanner::Val_Always) 03141 fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE); 03142 03143 } 03144 else 03145 { 03146 fScanner->emitError(XMLErrs::ExpectedIncOrIgn); 03147 fReaderMgr->skipPastChar(chCloseAngle); 03148 } 03149 } 03150 else if (fReaderMgr->skippedString(XMLUni::fgAttListString)) 03151 { 03152 scanAttListDecl(); 03153 } 03154 else if (fReaderMgr->skippedString(XMLUni::fgElemString)) 03155 { 03156 scanElementDecl(); 03157 } 03158 else if (fReaderMgr->skippedString(XMLUni::fgEntityString)) 03159 { 03160 scanEntityDecl(); 03161 } 03162 else if (fReaderMgr->skippedString(XMLUni::fgNotationString)) 03163 { 03164 scanNotationDecl(); 03165 } 03166 else 03167 { 03168 fScanner->emitError(XMLErrs::ExpectedMarkupDecl); 03169 fReaderMgr->skipPastChar(chCloseAngle); 03170 } 03171 } 03172 else if (nextCh == chQuestion) 03173 { 03174 // It could be a PI or the XML declaration. Check for Decl 03175 if (fScanner->checkXMLDecl(false)) 03176 { 03177 // If we are not accepting text decls, its an error 03178 if (parseTextDecl) 03179 { 03180 scanTextDecl(); 03181 } 03182 else 03183 { 03184 // Emit the error and skip past this markup 03185 fScanner->emitError(XMLErrs::TextDeclNotLegalHere); 03186 fReaderMgr->skipPastChar(chCloseAngle); 03187 } 03188 } 03189 else 03190 { 03191 // It has to be a PI 03192 scanPI(); 03193 } 03194 } 03195 else 03196 { 03197 // Can't be valid so emit error and try to skip past end of this decl 03198 fScanner->emitError(XMLErrs::ExpectedMarkupDecl); 03199 fReaderMgr->skipPastChar(chCloseAngle); 03200 } 03201 } 03202 03203 03204 // 03205 // This method is called for a mixed model element's content mode. We've 03206 // already scanned past the '(PCDATA' part by the time we get here. So 03207 // everything else is element names separated by | characters until we 03208 // hit the end. The passed element decl's content model is filled in with 03209 // the information found. 03210 // 03211 bool DTDScanner::scanMixed(DTDElementDecl& toFill) 03212 { 03213 // 03214 // The terminating star is only required if there is something more 03215 // than (PCDATA). 03216 // 03217 bool starRequired = false; 03218 03219 // Get a buffer to be used below to get element names 03220 XMLBufBid bbName(fBufMgr); 03221 XMLBuffer& nameBuf = bbName.getBuffer(); 03222 03223 // 03224 // Create an initial content spec node. Its just a leaf node with a 03225 // PCDATA element id. This current node pointer will be pushed down the 03226 // tree as we go. 03227 // 03228 ContentSpecNode* curNode = new (fGrammarPoolMemoryManager) ContentSpecNode 03229 ( 03230 new (fGrammarPoolMemoryManager) QName 03231 ( 03232 XMLUni::fgZeroLenString 03233 , XMLUni::fgZeroLenString 03234 , XMLElementDecl::fgPCDataElemId 03235 , fGrammarPoolMemoryManager 03236 ) 03237 , false 03238 , fGrammarPoolMemoryManager 03239 ); 03240 03241 // 03242 // Set the initial leaf as the temporary head. If we hit the first choice 03243 // node, it will be set up here. When done, this is the node that's set 03244 // as the content spec for the element. 03245 // 03246 ContentSpecNode* headNode = curNode; 03247 03248 // Remember the original node so we can sense the first choice node 03249 ContentSpecNode* orgNode = curNode; 03250 03251 // 03252 // We just loop around, getting the | character at the top and then 03253 // looking for the next element name. We keep up with the last node 03254 // and add each new one to its right node. 03255 // 03256 while (true) 03257 { 03258 // 03259 // First of all we check for some grunt work details of skipping 03260 // whitespace, expand PE refs, and catching invalid reps. 03261 // 03262 if (fReaderMgr->lookingAtChar(chPercent)) 03263 { 03264 // Expand it and continue 03265 checkForPERef(false, true); 03266 } 03267 else if (fReaderMgr->skippedChar(chAsterisk)) 03268 { 03269 // 03270 // Tell them they can't have reps in mixed model, but eat 03271 // it and keep going if we are allowed to. 03272 // 03273 if (fScanner->emitErrorWillThrowException(XMLErrs::NoRepInMixed)) 03274 { 03275 delete headNode; 03276 } 03277 fScanner->emitError(XMLErrs::NoRepInMixed); 03278 } 03279 else if (fReaderMgr->skippedSpace()) 03280 { 03281 // Spaces are ok at this point, just eat them and continue 03282 fReaderMgr->skipPastSpaces(); 03283 } 03284 else 03285 { 03286 if (!fReaderMgr->skippedChar(chPipe)) 03287 { 03288 // Has to be the closing paren now. 03289 if (!fReaderMgr->skippedChar(chCloseParen)) 03290 { 03291 delete headNode; 03292 fScanner->emitError(XMLErrs::UnterminatedContentModel, toFill.getElementName()->getLocalPart()); 03293 return false; 03294 } 03295 03296 bool starSkipped = true; 03297 if (!fReaderMgr->skippedChar(chAsterisk)) { 03298 03299 starSkipped = false; 03300 03301 if (starRequired) 03302 { 03303 if (fScanner->emitErrorWillThrowException(XMLErrs::ExpectedAsterisk)) 03304 { 03305 delete headNode; 03306 } 03307 fScanner->emitError(XMLErrs::ExpectedAsterisk); 03308 } 03309 } 03310 03311 // 03312 // Create a zero or more node and make the original head 03313 // node its first child. 03314 // 03315 if (starRequired || starSkipped) { 03316 headNode = new (fGrammarPoolMemoryManager) ContentSpecNode 03317 ( 03318 ContentSpecNode::ZeroOrMore 03319 , headNode 03320 , 0 03321 , true 03322 , true 03323 , fGrammarPoolMemoryManager 03324 ); 03325 } 03326 03327 // Store the head node as the content spec of the element. 03328 toFill.setContentSpec(headNode); 03329 break; 03330 } 03331 03332 // Its more than just a PCDATA, so an ending star will be required now 03333 starRequired = true; 03334 03335 // Space is legal here so check for a PE ref, but don't require space 03336 checkForPERef(false, true); 03337 03338 // Get a name token 03339 if (!fReaderMgr->getName(nameBuf)) 03340 { 03341 delete headNode; 03342 fScanner->emitError(XMLErrs::ExpectedElementName); 03343 return false; 03344 } 03345 03346 // 03347 // Create a leaf node for it. If we can find the element id for 03348 // this element, then use it. Else, we have to fault in an element 03349 // decl, marked as created because of being in a content model. 03350 // 03351 XMLElementDecl* decl = fDTDGrammar->getElemDecl(fEmptyNamespaceId, 0, nameBuf.getRawBuffer(), Grammar::TOP_LEVEL_SCOPE); 03352 if (!decl) 03353 { 03354 decl = new (fGrammarPoolMemoryManager) DTDElementDecl 03355 ( 03356 nameBuf.getRawBuffer() 03357 , fEmptyNamespaceId 03358 , DTDElementDecl::Any 03359 , fGrammarPoolMemoryManager 03360 ); 03361 decl->setCreateReason(XMLElementDecl::InContentModel); 03362 decl->setExternalElemDeclaration(isReadingExternalEntity()); 03363 fDTDGrammar->putElemDecl(decl); 03364 } 03365 03366 // 03367 // If the current node is the original node, this is the first choice 03368 // node, so create an initial choice node with the current node and 03369 // the new element id. Store this as the head node. 03370 // 03371 // Otherwise, we have to steal the right node of the previous choice 03372 // and weave in another choice node there, which has the old choice 03373 // as its left and the new leaf as its right. 03374 // 03375 if (curNode == orgNode) 03376 { 03377 curNode = new (fGrammarPoolMemoryManager) ContentSpecNode 03378 ( 03379 ContentSpecNode::Choice 03380 , curNode 03381 , new (fGrammarPoolMemoryManager) ContentSpecNode 03382 ( 03383 decl->getElementName() 03384 , fGrammarPoolMemoryManager 03385 ) 03386 , true 03387 , true 03388 , fGrammarPoolMemoryManager 03389 ); 03390 03391 // Remember the top node 03392 headNode = curNode; 03393 } 03394 else 03395 { 03396 ContentSpecNode* oldRight = curNode->orphanSecond(); 03397 curNode->setSecond 03398 ( 03399 new (fGrammarPoolMemoryManager) ContentSpecNode 03400 ( 03401 ContentSpecNode::Choice 03402 , oldRight 03403 , new (fGrammarPoolMemoryManager) ContentSpecNode 03404 ( 03405 decl->getElementName() 03406 , fGrammarPoolMemoryManager 03407 ) 03408 , true 03409 , true 03410 , fGrammarPoolMemoryManager 03411 ) 03412 ); 03413 03414 // Make the new right node the current node 03415 curNode = curNode->getSecond(); 03416 } 03417 } 03418 } 03419 03420 return true; 03421 } 03422 03423 03424 // 03425 // This method is called when we see a '<!NOTATION' string while scanning 03426 // markup decl. It parses out the notation and its id and stores a new 03427 // notation decl object in the notation decl pool. 03428 // 03429 void DTDScanner::scanNotationDecl() 03430 { 03431 // Space is required here so check for a PE ref, and require space 03432 if (!checkForPERef(false, true)) 03433 { 03434 fScanner->emitError(XMLErrs::ExpectedWhitespace); 03435 fReaderMgr->skipPastChar(chCloseAngle); 03436 return; 03437 } 03438 03439 // 03440 // And now we get a name, which is the name of the notation. Get a 03441 // buffer for the name. 03442 // 03443 XMLBufBid bbName(fBufMgr); 03444 if (!fReaderMgr->getName(bbName.getBuffer())) 03445 { 03446 fScanner->emitError(XMLErrs::ExpectedNotationName); 03447 fReaderMgr->skipPastChar(chCloseAngle); 03448 return; 03449 } 03450 03451 // If namespaces are enabled, then no colons allowed 03452 if (fScanner->getDoNamespaces()) 03453 { 03454 if (XMLString::indexOf(bbName.getRawBuffer(), chColon) != -1) 03455 fScanner->emitError(XMLErrs::ColonNotLegalWithNS); 03456 } 03457 03458 // Space is required here so check for a PE ref, and require space 03459 if (!checkForPERef(false, true)) 03460 { 03461 fScanner->emitError(XMLErrs::ExpectedWhitespace); 03462 fReaderMgr->skipPastChar(chCloseAngle); 03463 return; 03464 } 03465 03466 // 03467 // And scan an external or public id. We need buffers to use for both 03468 // of these. 03469 // 03470 XMLBufBid bbPubId(fBufMgr); 03471 XMLBufBid bbSysId(fBufMgr); 03472 if (!scanId(bbPubId.getBuffer(), bbSysId.getBuffer(), IDType_Either)) 03473 { 03474 fReaderMgr->skipPastChar(chCloseAngle); 03475 return; 03476 } 03477 03478 // We can have an optional space or PE ref here 03479 checkForPERef(false, true); 03480 03481 // 03482 // See if it already exists. If so, add it to the notatino decl pool. 03483 // Otherwise, if advanced callbacks are on, create a temp one and 03484 // call out for that one. 03485 // 03486 XMLNotationDecl* decl = fDTDGrammar->getNotationDecl(bbName.getRawBuffer()); 03487 bool isIgnoring = (decl != 0); 03488 if (isIgnoring) 03489 { 03490 fScanner->emitError(XMLErrs::NotationAlreadyExists, bbName.getRawBuffer()); 03491 } 03492 else 03493 { 03494 // Fill in a new notation declaration and add it to the pool 03495 const XMLCh* publicId = bbPubId.getRawBuffer(); 03496 const XMLCh* systemId = bbSysId.getRawBuffer(); 03497 ReaderMgr::LastExtEntityInfo lastInfo; 03498 fReaderMgr->getLastExtEntityInfo(lastInfo); 03499 03500 decl = new (fGrammarPoolMemoryManager) XMLNotationDecl 03501 ( 03502 bbName.getRawBuffer() 03503 , (publicId && *publicId) ? publicId : 0 03504 , (systemId && *systemId) ? systemId : 0 03505 , (lastInfo.systemId && *lastInfo.systemId) ? lastInfo.systemId : 0 03506 , fGrammarPoolMemoryManager 03507 ); 03508 fDTDGrammar->putNotationDecl(decl); 03509 } 03510 03511 // 03512 // If we have a document type handler, then tell it about this. If we 03513 // are ignoring it, only call out if advanced callbacks are enabled. 03514 // 03515 if (fDocTypeHandler) 03516 { 03517 fDocTypeHandler->notationDecl 03518 ( 03519 *decl 03520 , isIgnoring 03521 ); 03522 } 03523 03524 // And one more optional space or PE ref 03525 checkForPERef(false, true); 03526 03527 // And skip the terminating bracket 03528 if (!fReaderMgr->skippedChar(chCloseAngle)) 03529 fScanner->emitError(XMLErrs::UnterminatedNotationDecl); 03530 } 03531 03532 03533 // 03534 // Scans a PI and calls the appropriate callbacks. A PI can happen in either 03535 // the document or the DTD, so it calls the appropriate handler according 03536 // to the fInDocument flag. 03537 // 03538 // At entry we have just scanned the <? part, and need to now start on the 03539 // PI target name. 03540 // 03541 void DTDScanner::scanPI() 03542 { 03543 const XMLCh* namePtr = 0; 03544 const XMLCh* targetPtr = 0; 03545 03546 // 03547 // If there are any spaces here, then warn about it. If we aren't in 03548 // 'first error' mode, then we'll come back and can easily pick up 03549 // again by just skipping them. 03550 // 03551 if (fReaderMgr->lookingAtSpace()) 03552 { 03553 fScanner->emitError(XMLErrs::PINameExpected); 03554 fReaderMgr->skipPastSpaces(); 03555 } 03556 03557 // Get a buffer for the PI name and scan it in 03558 XMLBufBid bbName(fBufMgr); 03559 if (!fReaderMgr->getName(bbName.getBuffer())) 03560 { 03561 fScanner->emitError(XMLErrs::PINameExpected); 03562 fReaderMgr->skipPastChar(chCloseAngle); 03563 return; 03564 } 03565 03566 // Point the name pointer at the raw data 03567 namePtr = bbName.getRawBuffer(); 03568 03569 // See if it issome form of 'xml' and emit a warning 03570 //if (!XMLString::compareIString(namePtr, XMLUni::fgXMLString)) 03571 if (bbName.getLen() == 3 && 03572 (((namePtr[0] == chLatin_x) || (namePtr[0] == chLatin_X)) && 03573 ((namePtr[1] == chLatin_m) || (namePtr[1] == chLatin_M)) && 03574 ((namePtr[2] == chLatin_l) || (namePtr[2] == chLatin_L)))) 03575 fScanner->emitError(XMLErrs::NoPIStartsWithXML); 03576 03577 // If namespaces are enabled, then no colons allowed 03578 if (fScanner->getDoNamespaces()) 03579 { 03580 if (XMLString::indexOf(namePtr, chColon) != -1) 03581 fScanner->emitError(XMLErrs::ColonNotLegalWithNS); 03582 } 03583 03584 // 03585 // If we don't hit a space next, then the PI has no target. If we do 03586 // then get out the target. Get a buffer for it as well 03587 // 03588 XMLBufBid bbTarget(fBufMgr); 03589 if (fReaderMgr->skippedSpace()) 03590 { 03591 // Skip any leading spaces 03592 fReaderMgr->skipPastSpaces(); 03593 03594 bool gotLeadingSurrogate = false; 03595 03596 // It does have a target, so lets move on to deal with that. 03597 while (1) 03598 { 03599 const XMLCh nextCh = fReaderMgr->getNextChar(); 03600 03601 // Watch for an end of file, which is always bad here 03602 if (!nextCh) 03603 { 03604 fScanner->emitError(XMLErrs::UnterminatedPI); 03605 ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager); 03606 } 03607 03608 // Watch for potential terminating character 03609 if (nextCh == chQuestion) 03610 { 03611 // It must be followed by '>' to be a termination of the target 03612 if (fReaderMgr->skippedChar(chCloseAngle)) 03613 break; 03614 } 03615 03616 // Check for correct surrogate pairs 03617 if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF)) 03618 { 03619 if (gotLeadingSurrogate) 03620 fScanner->emitError(XMLErrs::Expected2ndSurrogateChar); 03621 else 03622 gotLeadingSurrogate = true; 03623 } 03624 else 03625 { 03626 if (gotLeadingSurrogate) 03627 { 03628 if ((nextCh < 0xDC00) || (nextCh > 0xDFFF)) 03629 fScanner->emitError(XMLErrs::Expected2ndSurrogateChar); 03630 } 03631 // Its got to at least be a valid XML character 03632 else if (!fReaderMgr->getCurrentReader()->isXMLChar(nextCh)) { 03633 03634 XMLCh tmpBuf[9]; 03635 XMLString::binToText 03636 ( 03637 nextCh 03638 , tmpBuf 03639 , 8 03640 , 16 03641 , fMemoryManager 03642 ); 03643 fScanner->emitError(XMLErrs::InvalidCharacter, tmpBuf); 03644 } 03645 03646 gotLeadingSurrogate = false; 03647 } 03648 bbTarget.append(nextCh); 03649 } 03650 } 03651 else 03652 { 03653 // No target, but make sure its terminated ok 03654 if (!fReaderMgr->skippedChar(chQuestion)) 03655 { 03656 fScanner->emitError(XMLErrs::UnterminatedPI); 03657 fReaderMgr->skipPastChar(chCloseAngle); 03658 return; 03659 } 03660 03661 if (!fReaderMgr->skippedChar(chCloseAngle)) 03662 { 03663 fScanner->emitError(XMLErrs::UnterminatedPI); 03664 fReaderMgr->skipPastChar(chCloseAngle); 03665 return; 03666 } 03667 } 03668 03669 // Point the target pointer at the raw data 03670 targetPtr = bbTarget.getRawBuffer(); 03671 03672 // 03673 // If we have a handler, then call it. 03674 // 03675 if (fDocTypeHandler) 03676 { 03677 fDocTypeHandler->doctypePI 03678 ( 03679 namePtr 03680 , targetPtr 03681 ); 03682 } 03683 } 03684 03685 03686 // 03687 // This method scans a public literal. It must be quoted and all of its 03688 // characters must be valid public id characters. The quotes are discarded 03689 // and the results are returned. 03690 // 03691 bool DTDScanner::scanPublicLiteral(XMLBuffer& toFill) 03692 { 03693 toFill.reset(); 03694 03695 // Get the next char which must be a single or double quote 03696 XMLCh quoteCh; 03697 if (!fReaderMgr->skipIfQuote(quoteCh)) { 03698 fScanner->emitError(XMLErrs::ExpectedQuotedString); 03699 return false; 03700 } 03701 03702 while (true) 03703 { 03704 const XMLCh nextCh = fReaderMgr->getNextChar(); 03705 03706 // Watch for EOF 03707 if (!nextCh) 03708 ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager); 03709 03710 if (nextCh == quoteCh) 03711 break; 03712 03713 // 03714 // If its not a valid public id char, then report it but keep going 03715 // since that's the best recovery scheme. 03716 // 03717 if (!fReaderMgr->getCurrentReader()->isPublicIdChar(nextCh)) 03718 { 03719 XMLCh tmpBuf[9]; 03720 XMLString::binToText 03721 ( 03722 nextCh 03723 , tmpBuf 03724 , 8 03725 , 16 03726 , fMemoryManager 03727 ); 03728 fScanner->emitError(XMLErrs::InvalidPublicIdChar, tmpBuf); 03729 } 03730 03731 toFill.append(nextCh); 03732 } 03733 return true; 03734 } 03735 03736 03737 // 03738 // This method handles scanning in a quoted system literal. It expects to 03739 // start on the open quote and returns after eating the ending quote. There 03740 // are not really any restrictions on the contents of system literals. 03741 // 03742 bool DTDScanner::scanSystemLiteral(XMLBuffer& toFill) 03743 { 03744 toFill.reset(); 03745 03746 // Get the next char which must be a single or double quote 03747 XMLCh quoteCh; 03748 if (!fReaderMgr->skipIfQuote(quoteCh)) { 03749 fScanner->emitError(XMLErrs::ExpectedQuotedString); 03750 return false; 03751 } 03752 03753 XMLCh nextCh; 03754 // Break out on terminating quote 03755 while ((nextCh=fReaderMgr->getNextChar())!=quoteCh) 03756 { 03757 // Watch for EOF 03758 if (!nextCh) 03759 ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager); 03760 toFill.append(nextCh); 03761 } 03762 return true; 03763 } 03764 03765 03766 03767 // 03768 // This method is called to scan a text decl line, which can be the first 03769 // line in an external entity or external subset. 03770 // 03771 // On entry the <? has been scanned, and next should be 'xml' followed by 03772 // some whitespace, version string, etc... 03773 // [77] TextDecl::= '<?xml' VersionInfo? EncodingDecl S? '?>' 03774 // 03775 void DTDScanner::scanTextDecl() 03776 { 03777 // Skip any subsequent whitespace before the version string 03778 fReaderMgr->skipPastSpaces(); 03779 03780 // Next should be the version string 03781 XMLBufBid bbVersion(fBufMgr); 03782 if (fReaderMgr->skippedString(XMLUni::fgVersionString)) 03783 { 03784 if (!scanEq()) 03785 { 03786 fScanner->emitError(XMLErrs::ExpectedEqSign); 03787 fReaderMgr->skipPastChar(chCloseAngle); 03788 return; 03789 } 03790 03791 // 03792 // Followed by a single or double quoted version. Get a buffer for 03793 // the string. 03794 // 03795 if (!getQuotedString(bbVersion.getBuffer())) 03796 { 03797 fScanner->emitError(XMLErrs::BadXMLVersion); 03798 fReaderMgr->skipPastChar(chCloseAngle); 03799 return; 03800 } 03801 03802 // If its not our supported version, issue an error but continue 03803 if (XMLString::equals(bbVersion.getRawBuffer(), XMLUni::fgVersion1_1)) { 03804 if (fScanner->getXMLVersion() != XMLReader::XMLV1_1) 03805 fScanner->emitError(XMLErrs::UnsupportedXMLVersion, bbVersion.getRawBuffer()); 03806 } 03807 else if (!XMLString::equals(bbVersion.getRawBuffer(), XMLUni::fgVersion1_0)) 03808 fScanner->emitError(XMLErrs::UnsupportedXMLVersion, bbVersion.getRawBuffer()); 03809 } 03810 03811 // Ok, now we must have an encoding string 03812 XMLBufBid bbEncoding(fBufMgr); 03813 fReaderMgr->skipPastSpaces(); 03814 bool gotEncoding = false; 03815 if (fReaderMgr->skippedString(XMLUni::fgEncodingString)) 03816 { 03817 // There must be a equal sign next 03818 if (!scanEq()) 03819 { 03820 fScanner->emitError(XMLErrs::ExpectedEqSign); 03821 fReaderMgr->skipPastChar(chCloseAngle); 03822 return; 03823 } 03824 03825 // Followed by a single or double quoted version string 03826 getQuotedString(bbEncoding.getBuffer()); 03827 if (bbEncoding.isEmpty() || !XMLString::isValidEncName(bbEncoding.getRawBuffer())) 03828 { 03829 fScanner->emitError(XMLErrs::BadXMLEncoding, bbEncoding.getRawBuffer()); 03830 fReaderMgr->skipPastChar(chCloseAngle); 03831 return; 03832 } 03833 03834 // Indicate that we got an encoding 03835 gotEncoding = true; 03836 } 03837 03838 // 03839 // Encoding declarations are required in the external entity 03840 // if there is a text declaration present 03841 // 03842 if (!gotEncoding) 03843 { 03844 fScanner->emitError(XMLErrs::EncodingRequired); 03845 fReaderMgr->skipPastChar(chCloseAngle); 03846 return; 03847 03848 } 03849 03850 fReaderMgr->skipPastSpaces(); 03851 if (!fReaderMgr->skippedChar(chQuestion)) 03852 { 03853 fScanner->emitError(XMLErrs::UnterminatedXMLDecl); 03854 fReaderMgr->skipPastChar(chCloseAngle); 03855 } 03856 else if (!fReaderMgr->skippedChar(chCloseAngle)) 03857 { 03858 fScanner->emitError(XMLErrs::UnterminatedXMLDecl); 03859 fReaderMgr->skipPastChar(chCloseAngle); 03860 } 03861 03862 // 03863 // If we have a document type handler and advanced callbacks are on, 03864 // then call the TextDecl callback 03865 // 03866 if (fDocTypeHandler) 03867 { 03868 fDocTypeHandler->TextDecl 03869 ( 03870 bbVersion.getRawBuffer() 03871 , bbEncoding.getRawBuffer() 03872 ); 03873 } 03874 03875 // 03876 // If we got an encoding string, then we have to call back on the reader 03877 // to tell it what the encoding is. 03878 // 03879 if (!bbEncoding.isEmpty()) 03880 { 03881 if (!fReaderMgr->getCurrentReader()->setEncoding(bbEncoding.getRawBuffer())) 03882 fScanner->emitError(XMLErrs::ContradictoryEncoding, bbEncoding.getRawBuffer()); 03883 } 03884 } 03885 03886 XERCES_CPP_NAMESPACE_END