GME: DTDScanner.cpp Source File

Go to the documentation of this file.
00001 /*
00002  * Licensed to the Apache Software Foundation (ASF) under one or more
00003  * contributor license agreements.  See the NOTICE file distributed with
00004  * this work for additional information regarding copyright ownership.
00005  * The ASF licenses this file to You under the Apache License, Version 2.0
00006  * (the "License"); you may not use this file except in compliance with
00007  * the License.  You may obtain a copy of the License at
00008  * 
00009  *      http://www.apache.org/licenses/LICENSE-2.0
00010  * 
00011  * Unless required by applicable law or agreed to in writing, software
00012  * distributed under the License is distributed on an "AS IS" BASIS,
00013  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00014  * See the License for the specific language governing permissions and
00015  * limitations under the License.
00016  */
00017 
00018 /*
00019  * $Id: DTDScanner.cpp 833045 2009-11-05 13:21:27Z borisk $
00020  */
00021 
00022 
00023 // ---------------------------------------------------------------------------
00024 //  Includes
00025 // ---------------------------------------------------------------------------
00026 #include <xercesc/util/BinMemInputStream.hpp>
00027 #include <xercesc/util/FlagJanitor.hpp>
00028 #include <xercesc/util/Janitor.hpp>
00029 #include <xercesc/util/XMLUniDefs.hpp>
00030 #include <xercesc/util/ValueStackOf.hpp>
00031 #include <xercesc/util/UnexpectedEOFException.hpp>
00032 #include <xercesc/util/OutOfMemoryException.hpp>
00033 #include <xercesc/sax/InputSource.hpp>
00034 #include <xercesc/framework/XMLDocumentHandler.hpp>
00035 #include <xercesc/framework/XMLEntityHandler.hpp>
00036 #include <xercesc/framework/XMLValidator.hpp>
00037 #include <xercesc/internal/EndOfEntityException.hpp>
00038 #include <xercesc/internal/XMLScanner.hpp>
00039 #include <xercesc/validators/common/ContentSpecNode.hpp>
00040 #include <xercesc/validators/common/MixedContentModel.hpp>
00041 #include <xercesc/validators/DTD/DTDEntityDecl.hpp>
00042 #include <xercesc/validators/DTD/DocTypeHandler.hpp>
00043 #include <xercesc/validators/DTD/DTDScanner.hpp>
00044 
00045 XERCES_CPP_NAMESPACE_BEGIN
00046 
00047 // ---------------------------------------------------------------------------
00048 //  Local methods
00049 // ---------------------------------------------------------------------------
00050 //
00051 //  This method automates the grunt work of looking at a char and seeing if
00052 //  it's a repetition suffix. If so, it creates a new rep node of the correct
00053 //  type and wraps the passed node in it. Otherwise, it returns the passed node.
00054 //
00055 static ContentSpecNode* makeRepNode(const XMLCh testCh,
00056                                     ContentSpecNode* const prevNode,
00057                                     MemoryManager* const manager)
00058 {
00059     if (testCh == chQuestion)
00060     {
00061         return new (manager) ContentSpecNode
00062         (
00063             ContentSpecNode::ZeroOrOne
00064             , prevNode
00065             , 0
00066             , true
00067             , true
00068             , manager
00069         );
00070     }
00071      else if (testCh == chPlus)
00072     {
00073         return new (manager) ContentSpecNode
00074         (
00075             ContentSpecNode::OneOrMore
00076             , prevNode
00077             , 0
00078             , true
00079             , true
00080             , manager
00081         );
00082     }
00083      else if (testCh == chAsterisk)
00084     {
00085         return new (manager) ContentSpecNode
00086         (
00087             ContentSpecNode::ZeroOrMore
00088             , prevNode
00089             , 0
00090             , true
00091             , true
00092             , manager
00093         );
00094     }
00095 
00096     // Just return the incoming node
00097     return prevNode;
00098 }
00099 
00100 // ---------------------------------------------------------------------------
00101 //  DTDScanner: Constructors and Destructor
00102 // ---------------------------------------------------------------------------
00103 DTDScanner::DTDScanner( DTDGrammar*           dtdGrammar
00104                       , DocTypeHandler* const docTypeHandler
00105                       , MemoryManager* const  grammarPoolMemoryManager
00106                       , MemoryManager* const  manager) :
00107     fMemoryManager(manager)
00108     , fGrammarPoolMemoryManager(grammarPoolMemoryManager)
00109     , fDocTypeHandler(docTypeHandler)
00110     , fDumAttDef(0)
00111     , fDumElemDecl(0)
00112     , fDumEntityDecl(0)
00113     , fInternalSubset(false)
00114     , fNextAttrId(1)
00115     , fDTDGrammar(dtdGrammar)
00116     , fBufMgr(0)
00117     , fReaderMgr(0)
00118     , fScanner(0)
00119     , fPEntityDeclPool(0)
00120     , fEmptyNamespaceId(0)
00121     , fDocTypeReaderId(0)
00122 {
00123     fPEntityDeclPool = new (fMemoryManager) NameIdPool<DTDEntityDecl>(109, 128, fMemoryManager);
00124 }
00125 
00126 DTDScanner::~DTDScanner()
00127 {
00128     delete fDumAttDef;
00129     delete fDumElemDecl;
00130     delete fDumEntityDecl;
00131     delete fPEntityDeclPool;
00132 }
00133 
00134 // -----------------------------------------------------------------------
00135 //  Setter methods
00136 // -----------------------------------------------------------------------
00137 void DTDScanner::setScannerInfo(XMLScanner* const      owningScanner
00138                             , ReaderMgr* const      readerMgr
00139                             , XMLBufferMgr* const   bufMgr)
00140 {
00141     // We don't own any of these, we just reference them
00142     fScanner = owningScanner;
00143     fReaderMgr = readerMgr;
00144     fBufMgr = bufMgr;
00145 
00146     if (fScanner->getDoNamespaces())
00147         fEmptyNamespaceId = fScanner->getEmptyNamespaceId();
00148     else
00149         fEmptyNamespaceId = 0;
00150 
00151     fDocTypeReaderId = fReaderMgr->getCurrentReaderNum();
00152 }
00153 
00154 
00155 // ---------------------------------------------------------------------------
00156 //  DTDScanner: Private scanning methods
00157 // ---------------------------------------------------------------------------
00158 bool DTDScanner::checkForPERef(   const bool    inLiteral
00159                                 , const bool    inMarkup)
00160 {
00161     bool gotSpace = false;
00162 
00163     //
00164     //  See if we have any spaces up front. If so, then skip them and set
00165     //  the gotSpaces flag.
00166     //
00167     if (fReaderMgr->skippedSpace())
00168     {
00169         fReaderMgr->skipPastSpaces();
00170         gotSpace = true;
00171     }
00172 
00173     // If the next char is a percent, then expand the PERef
00174     if (!fReaderMgr->skippedChar(chPercent))
00175        return gotSpace;
00176 
00177     while (true)
00178     {
00179        if (!expandPERef(false, inLiteral, inMarkup, false))
00180           fScanner->emitError(XMLErrs::ExpectedEntityRefName);
00181        // And skip any more spaces in the expanded value
00182        if (fReaderMgr->skippedSpace())
00183        {
00184           fReaderMgr->skipPastSpaces();
00185           gotSpace = true;
00186        }
00187        if (!fReaderMgr->skippedChar(chPercent))
00188           break;
00189     }
00190     return gotSpace;
00191 }
00192 
00193 
00194 bool DTDScanner::expandPERef( const   bool    scanExternal
00195                                 , const bool    inLiteral
00196                                 , const bool    inMarkup
00197                                 , const bool    throwEndOfExt)
00198 {
00199     fScanner->setHasNoDTD(false);
00200     XMLBufBid bbName(fBufMgr);
00201 
00202     //
00203     //  If we are in the internal subset and in markup, then this is
00204     //  an error but we go ahead and do it anyway.
00205     //
00206     if (fInternalSubset && inMarkup)
00207         fScanner->emitError(XMLErrs::PERefInMarkupInIntSubset);
00208 
00209     if (!fReaderMgr->getName(bbName.getBuffer()))
00210     {
00211         fScanner->emitError(XMLErrs::ExpectedPEName);
00212 
00213         // Skip the semicolon if that's what we ended up on
00214         fReaderMgr->skippedChar(chSemiColon);
00215         return false;
00216     }
00217 
00218     // If no terminating semicolon, emit an error but try to keep going
00219     if (!fReaderMgr->skippedChar(chSemiColon))
00220         fScanner->emitError(XMLErrs::UnterminatedEntityRef, bbName.getRawBuffer());
00221 
00222     //
00223     //  Look it up in the PE decl pool and see if it exists. If not, just
00224     //  emit an error and continue.
00225     //
00226     XMLEntityDecl* decl = fPEntityDeclPool->getByKey(bbName.getRawBuffer());
00227     if (!decl)
00228     {
00229         // XML 1.0 Section 4.1
00230         if (fScanner->getStandalone()) {
00231             // no need to check fScanner->fHasNoDTD which is for sure false
00232             // since we are in expandPERef already
00233             fScanner->emitError(XMLErrs::EntityNotFound, bbName.getRawBuffer());
00234         }
00235         else {
00236             if (fScanner->getValidationScheme() == XMLScanner::Val_Always)
00237                 fScanner->getValidator()->emitError(XMLValid::VC_EntityNotFound, bbName.getRawBuffer());
00238         }
00239 
00240         return false;
00241     }
00242 
00243     //
00244     // XML 1.0 Section 2.9
00245     //  If we are a standalone document, then it has to have been declared
00246     //  in the internal subset. Keep going though.
00247     //
00248     if (fScanner->getValidationScheme() == XMLScanner::Val_Always && fScanner->getStandalone() && !decl->getDeclaredInIntSubset())
00249         fScanner->getValidator()->emitError(XMLValid::VC_IllegalRefInStandalone, bbName.getRawBuffer());
00250 
00251     //
00252     //  Okee dokee, we found it. So create either a memory stream with
00253     //  the entity value contents, or a file stream if its an external
00254     //  entity.
00255     //
00256     if (decl->isExternal())
00257     {
00258         // And now create a reader to read this entity
00259         InputSource* srcUsed;
00260         XMLReader* reader = fReaderMgr->createReader
00261         (
00262             decl->getBaseURI()
00263             , decl->getSystemId()
00264             , decl->getPublicId()
00265             , false
00266             , inLiteral ? XMLReader::RefFrom_Literal : XMLReader::RefFrom_NonLiteral
00267             , XMLReader::Type_PE
00268             , XMLReader::Source_External
00269             , srcUsed
00270             , fScanner->getCalculateSrcOfs()
00271             , fScanner->getLowWaterMark()
00272             , fScanner->getDisableDefaultEntityResolution()
00273         );
00274 
00275         // Put a janitor on the source so its cleaned up on exit
00276         Janitor<InputSource> janSrc(srcUsed);
00277 
00278         // If the creation failed then throw an exception
00279         if (!reader)
00280             ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Gen_CouldNotOpenExtEntity, srcUsed ? srcUsed->getSystemId() : decl->getSystemId(), fMemoryManager);
00281 
00282         // Set the 'throw at end' flag, to the one we were given
00283         reader->setThrowAtEnd(throwEndOfExt);
00284 
00285         //
00286         //  Push the reader. If its a recursive expansion, then emit an error
00287         //  and return an failure.
00288         //
00289         if (!fReaderMgr->pushReader(reader, decl))
00290         {
00291             fScanner->emitError(XMLErrs::RecursiveEntity, decl->getName());
00292             return false;
00293         }
00294 
00295         //
00296         //  If the caller wants us to scan the external entity, then lets
00297         //  do that now.
00298         //
00299         if (scanExternal)
00300         {
00301             XMLEntityHandler* entHandler = fScanner->getEntityHandler();
00302 
00303             // If we have an entity handler, tell it we are starting this entity
00304             if (entHandler)
00305                 entHandler->startInputSource(*srcUsed);
00306 
00307             //
00308             //  Scan the external entity now. The parameter tells it that
00309             //  it is not in an include section. Get the current reader
00310             //  level so we can catch partial markup errors and be sure
00311             //  to get back to here if we get an exception out of the
00312             //  ext subset scan.
00313             //
00314             const XMLSize_t readerNum = fReaderMgr->getCurrentReaderNum();
00315             try
00316             {
00317                 scanExtSubsetDecl(false, false);
00318             }
00319             catch(const OutOfMemoryException&)
00320             {
00321                 throw;
00322             }
00323             catch(...)
00324             {
00325                 // Pop the reader back to the original level
00326                 fReaderMgr->cleanStackBackTo(readerNum);
00327 
00328                 // End the input source, even though its not happy
00329                 if (entHandler)
00330                     entHandler->endInputSource(*srcUsed);
00331                 throw;
00332             }
00333 
00334             // If we have an entity handler, tell it we are ending this entity
00335             if (entHandler)
00336                 entHandler->endInputSource(*srcUsed);
00337         }
00338         else {
00339             // If it starts with the XML string, then parse a text decl
00340             if (fScanner->checkXMLDecl(true))
00341                 scanTextDecl();
00342         }
00343     }
00344      else
00345     {
00346         // Create a reader over a memory stream over the entity value
00347         XMLReader* valueReader = fReaderMgr->createIntEntReader
00348         (
00349             decl->getName()
00350             , inLiteral ? XMLReader::RefFrom_Literal : XMLReader::RefFrom_NonLiteral
00351             , XMLReader::Type_PE
00352             , decl->getValue()
00353             , decl->getValueLen()
00354             , false
00355         );
00356 
00357         //
00358         //  Trt to push the entity reader onto the reader manager stack,
00359         //  where it will become the subsequent input. If it fails, that
00360         //  means the entity is recursive, so issue an error. The reader
00361         //  will have just been discarded, but we just keep going.
00362         //
00363         if (!fReaderMgr->pushReader(valueReader, decl))
00364             fScanner->emitError(XMLErrs::RecursiveEntity, decl->getName());
00365     }
00366 
00367     return true;
00368 }
00369 
00370 
00371 bool DTDScanner::getQuotedString(XMLBuffer& toFill)
00372 {
00373     // Reset the target buffer
00374     toFill.reset();
00375 
00376     // Get the next char which must be a single or double quote
00377     XMLCh quoteCh;
00378     if (!fReaderMgr->skipIfQuote(quoteCh))
00379         return false;
00380 
00381         XMLCh nextCh;
00382     // Get another char and see if it matches the starting quote char
00383     while ((nextCh=fReaderMgr->getNextChar())!=quoteCh)
00384     {
00385         //
00386         //  We should never get either an end of file null char here. If we
00387         //  do, just fail. It will be handled more gracefully in the higher
00388         //  level code that called us.
00389         //
00390         if (!nextCh)
00391             return false;
00392 
00393         // Else add it to the buffer
00394         toFill.append(nextCh);
00395     }
00396     return true;
00397 }
00398 
00399 
00400 XMLAttDef*
00401 DTDScanner::scanAttDef(DTDElementDecl& parentElem, XMLBuffer& bufToUse)
00402 {
00403     // Check for PE ref or optional whitespace
00404     checkForPERef(false, true);
00405 
00406     // Get the name of the attribute
00407     if (!fReaderMgr->getName(bufToUse))
00408     {
00409         fScanner->emitError(XMLErrs::ExpectedAttrName);
00410         return 0;
00411     }
00412 
00413     //
00414     //  Look up this attribute in the parent element's attribute list. If
00415     //  it already exists, then use the dummy.
00416     //
00417     DTDAttDef* decl = parentElem.getAttDef(bufToUse.getRawBuffer());
00418     if (decl)
00419     {
00420         // It already exists, so put out a warning
00421         fScanner->emitError
00422         (
00423             XMLErrs::AttListAlreadyExists
00424             , bufToUse.getRawBuffer()
00425             , parentElem.getFullName()
00426         );
00427 
00428         // Use the dummy decl to parse into and set its name to the name we got
00429         if (!fDumAttDef)
00430         {
00431             fDumAttDef = new (fMemoryManager) DTDAttDef(fMemoryManager);
00432             fDumAttDef->setId(fNextAttrId++);
00433         }
00434         fDumAttDef->setName(bufToUse.getRawBuffer());
00435         decl = fDumAttDef;
00436     }
00437      else
00438     {
00439         //
00440         //  It does not already exist so create a new one, give it the next
00441         //  available unique id, and add it
00442         //
00443         decl = new (fGrammarPoolMemoryManager) DTDAttDef
00444         (
00445             bufToUse.getRawBuffer()
00446             , XMLAttDef::CData
00447             , XMLAttDef::Implied
00448             , fGrammarPoolMemoryManager
00449         );
00450         decl->setId(fNextAttrId++);
00451         decl->setExternalAttDeclaration(isReadingExternalEntity());
00452         parentElem.addAttDef(decl);
00453     }
00454 
00455     // Set a flag to indicate whether we are doing a dummy parse
00456     const bool isIgnored = (decl == fDumAttDef);
00457 
00458     // Space is required here, so check for PE ref, and require space
00459     if (!checkForPERef(false, true))
00460         fScanner->emitError(XMLErrs::ExpectedWhitespace);
00461 
00462     //
00463     //  Next has to be one of the attribute type strings. This tells us what
00464     //  is to follow.
00465     //
00466     if (fReaderMgr->skippedString(XMLUni::fgCDATAString))
00467     {
00468         decl->setType(XMLAttDef::CData);
00469     }
00470      else if (fReaderMgr->skippedString(XMLUni::fgIDString))
00471     {
00472         if (!fReaderMgr->skippedString(XMLUni::fgRefString))
00473             decl->setType(XMLAttDef::ID);
00474         else if (!fReaderMgr->skippedChar(chLatin_S))
00475             decl->setType(XMLAttDef::IDRef);
00476         else
00477             decl->setType(XMLAttDef::IDRefs);
00478     }
00479      else if (fReaderMgr->skippedString(XMLUni::fgEntitString))
00480     {
00481         if (fReaderMgr->skippedChar(chLatin_Y))
00482         {
00483             decl->setType(XMLAttDef::Entity);
00484         }
00485          else if (fReaderMgr->skippedString(XMLUni::fgIESString))
00486         {
00487             decl->setType(XMLAttDef::Entities);
00488         }
00489          else
00490         {
00491             fScanner->emitError
00492             (
00493                 XMLErrs::ExpectedAttributeType
00494                 , decl->getFullName()
00495                 , parentElem.getFullName()
00496             );
00497             return 0;
00498         }
00499     }
00500      else if (fReaderMgr->skippedString(XMLUni::fgNmTokenString))
00501     {
00502         if (fReaderMgr->skippedChar(chLatin_S))
00503             decl->setType(XMLAttDef::NmTokens);
00504         else
00505             decl->setType(XMLAttDef::NmToken);
00506     }
00507      else if (fReaderMgr->skippedString(XMLUni::fgNotationString))
00508     {
00509         // Check for PE ref and require space
00510         if (!checkForPERef(false, true))
00511             fScanner->emitError(XMLErrs::ExpectedWhitespace);
00512 
00513         decl->setType(XMLAttDef::Notation);
00514         if (!scanEnumeration(*decl, bufToUse, true))
00515             return 0;
00516 
00517         // Set the value as the enumeration for this decl
00518         decl->setEnumeration(bufToUse.getRawBuffer());
00519     }
00520      else if (fReaderMgr->skippedChar(chOpenParen))
00521     {
00522         decl->setType(XMLAttDef::Enumeration);
00523         if (!scanEnumeration(*decl, bufToUse, false))
00524             return 0;
00525 
00526         // Set the value as the enumeration for this decl
00527         decl->setEnumeration(bufToUse.getRawBuffer());
00528     }
00529      else
00530     {
00531         fScanner->emitError
00532         (
00533             XMLErrs::ExpectedAttributeType
00534             , decl->getFullName()
00535             , parentElem.getFullName()
00536         );
00537         return 0;
00538     }
00539 
00540     // Space is required here, so check for PE ref, and require space
00541     if (!checkForPERef(false, true))
00542         fScanner->emitError(XMLErrs::ExpectedWhitespace);
00543 
00544     // And then scan for the optional default value declaration
00545     scanDefaultDecl(*decl);
00546 
00547     // If validating, then do a couple of validation constraints
00548     if (fScanner->getValidationScheme() == XMLScanner::Val_Always)
00549     {
00550         if (decl->getType() == XMLAttDef::ID)
00551         {
00552             if ((decl->getDefaultType() != XMLAttDef::Implied)
00553             &&  (decl->getDefaultType() != XMLAttDef::Required))
00554             {
00555                 fScanner->getValidator()->emitError(XMLValid::BadIDAttrDefType, decl->getFullName());
00556             }
00557         }
00558 
00559         // if attdef is xml:space, check correct enumeration (default|preserve)
00560         const XMLCh fgXMLSpace[] = { chLatin_x, chLatin_m, chLatin_l, chColon, chLatin_s, chLatin_p, chLatin_a, chLatin_c, chLatin_e, chNull };
00561 
00562         if (XMLString::equals(decl->getFullName(),fgXMLSpace)) {
00563             const XMLCh fgPreserve[] = { chLatin_p, chLatin_r, chLatin_e, chLatin_s, chLatin_e, chLatin_r, chLatin_v, chLatin_e, chNull };
00564             const XMLCh fgDefault[] = { chLatin_d, chLatin_e, chLatin_f, chLatin_a, chLatin_u, chLatin_l, chLatin_t, chNull };
00565             bool ok = false;
00566             if (decl->getType() == XMLAttDef::Enumeration) {
00567                 BaseRefVectorOf<XMLCh>* enumVector = XMLString::tokenizeString(decl->getEnumeration(), fMemoryManager);
00568                 XMLSize_t size = enumVector->size();
00569                 ok = (size == 1 &&
00570                      (XMLString::equals(enumVector->elementAt(0), fgDefault) ||
00571                       XMLString::equals(enumVector->elementAt(0), fgPreserve))) ||
00572                      (size == 2 &&
00573                      (XMLString::equals(enumVector->elementAt(0), fgDefault) &&
00574                       XMLString::equals(enumVector->elementAt(1), fgPreserve))) ||
00575                      (size == 2 &&
00576                      (XMLString::equals(enumVector->elementAt(1), fgDefault) &&
00577                       XMLString::equals(enumVector->elementAt(0), fgPreserve)));
00578                 delete enumVector;
00579             }
00580             if (!ok)
00581                 fScanner->getValidator()->emitError(XMLValid::IllegalXMLSpace);
00582         }
00583     }
00584 
00585     // If we have a doc type handler, tell it about this attdef.
00586     if (fDocTypeHandler)
00587         fDocTypeHandler->attDef(parentElem, *decl, isIgnored);
00588     return decl;
00589 }
00590 
00591 
00592 void DTDScanner::scanAttListDecl()
00593 {
00594     // Space is required here, so check for a PE ref
00595     if (!checkForPERef(false, true))
00596     {
00597         fScanner->emitError(XMLErrs::ExpectedWhitespace);
00598         fReaderMgr->skipPastChar(chCloseAngle);
00599         return;
00600     }
00601 
00602     //
00603     //  Next should be the name of the element it belongs to, so get a buffer
00604     //  and get the name into it.
00605     //
00606     XMLBufBid bbName(fBufMgr);
00607     if (!fReaderMgr->getName(bbName.getBuffer()))
00608     {
00609         fScanner->emitError(XMLErrs::ExpectedElementName);
00610         fReaderMgr->skipPastChar(chCloseAngle);
00611         return;
00612     }
00613 
00614     //
00615     //  Find this element's declaration. If it has not been declared yet,
00616     //  we will force one into the list, but not mark it as declared.
00617     //
00618     DTDElementDecl* elemDecl = (DTDElementDecl*) fDTDGrammar->getElemDecl(fEmptyNamespaceId, 0, bbName.getRawBuffer(), Grammar::TOP_LEVEL_SCOPE);
00619     if (!elemDecl)
00620     {
00621         //
00622         //  Lets fault in a declaration and add it to the pool. We mark
00623         //  it having been created because of an attlist. Later, if its
00624         //  declared, this will be updated.
00625         //
00626         elemDecl = new (fGrammarPoolMemoryManager) DTDElementDecl
00627         (
00628             bbName.getRawBuffer()
00629             , fEmptyNamespaceId
00630             , DTDElementDecl::Any
00631             , fGrammarPoolMemoryManager
00632         );
00633         elemDecl->setCreateReason(XMLElementDecl::AttList);
00634         elemDecl->setExternalElemDeclaration(isReadingExternalEntity());
00635         fDTDGrammar->putElemDecl((XMLElementDecl*) elemDecl);
00636     }
00637 
00638     // If we have a doc type handler, tell it the att list is starting
00639     if (fDocTypeHandler)
00640         fDocTypeHandler->startAttList(*elemDecl);
00641 
00642     //
00643     //  Now we loop until we are done with all of the attributes in this
00644     //  list. We need a buffer to use for local processing.
00645     //
00646     XMLBufBid   bbTmp(fBufMgr);
00647     XMLBuffer&  tmpBuf = bbTmp.getBuffer();
00648     bool        seenAnId = false;
00649     while (true)
00650     {
00651         // Get the next char out and see what it tells us to do
00652         const XMLCh nextCh = fReaderMgr->peekNextChar();
00653 
00654         // Watch for EOF
00655         if (!nextCh)
00656             ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
00657 
00658         if (nextCh == chCloseAngle)
00659         {
00660             // We are done with this attribute list
00661             fReaderMgr->getNextChar();
00662             break;
00663         }
00664          else if (fReaderMgr->getCurrentReader()->isWhitespace(nextCh))
00665         {
00666             //
00667             //  If advanced callbacks are enabled and we have a doc
00668             //  type handler, then gather up the white space and call
00669             //  back on the doctype handler. Otherwise, just skip
00670             //  whitespace.
00671             //
00672             if (fDocTypeHandler)
00673             {
00674                 fReaderMgr->getSpaces(tmpBuf);
00675                 fDocTypeHandler->doctypeWhitespace
00676                 (
00677                     tmpBuf.getRawBuffer()
00678                     , tmpBuf.getLen()
00679                 );
00680             }
00681              else
00682             {
00683                 fReaderMgr->skipPastSpaces();
00684             }
00685         }
00686          else if (nextCh == chPercent)
00687         {
00688             // Eat the percent and expand the ref
00689             fReaderMgr->getNextChar();
00690             expandPERef(false, false, true);
00691         }
00692          else
00693         {
00694             //
00695             //  It must be an attribute name, so scan it. We let
00696             //  it use our local buffer for its name scanning.
00697             //
00698             XMLAttDef* attDef = scanAttDef(*elemDecl, tmpBuf);
00699 
00700             if (!attDef)
00701             {
00702                 fReaderMgr->skipPastChar(chCloseAngle);
00703                 break;
00704             }
00705 
00706             //
00707             //  If we are validating and its an ID type, then we have to
00708             //  make sure that we have not seen an id attribute yet. Set
00709             //  the flag to say that we've seen one now also.
00710             //
00711             if (fScanner->getValidationScheme() == XMLScanner::Val_Always)
00712             {
00713                 if (attDef->getType() == XMLAttDef::ID)
00714                 {
00715                     if (seenAnId)
00716                         fScanner->getValidator()->emitError(XMLValid::MultipleIdAttrs, elemDecl->getFullName());
00717                     seenAnId = true;
00718                 }
00719             }
00720         }
00721     }
00722 
00723     // If we have a doc type handler, tell it the att list is ending
00724     if (fDocTypeHandler)
00725         fDocTypeHandler->endAttList(*elemDecl);
00726 }
00727 
00728 
00729 //
00730 //  This method is called to scan the value of an attribute in content. This
00731 //  involves some normalization and replacement of general entity and
00732 //  character references.
00733 //
00734 //  End of entity's must be dealt with here. During DTD scan, they can come
00735 //  from external entities. During content, they can come from any entity.
00736 //  We just eat the end of entity and continue with our scan until we come
00737 //  to the closing quote. If an unterminated value causes us to go through
00738 //  subsequent entities, that will cause errors back in the calling code,
00739 //  but there's little we can do about it here.
00740 //
00741 bool DTDScanner::scanAttValue(const   XMLCh* const        attrName
00742                                 ,       XMLBuffer&          toFill
00743                                 , const XMLAttDef::AttTypes type)
00744 {
00745     enum States
00746     {
00747         InWhitespace
00748         , InContent
00749     };
00750 
00751     // Reset the target buffer
00752     toFill.reset();
00753 
00754     // Get the next char which must be a single or double quote
00755     XMLCh quoteCh;
00756     if (!fReaderMgr->skipIfQuote(quoteCh))
00757         return false;
00758 
00759     //
00760     //  We have to get the current reader because we have to ignore closing
00761     //  quotes until we hit the same reader again.
00762     //
00763     const XMLSize_t curReader = fReaderMgr->getCurrentReaderNum();
00764 
00765     //
00766     //  Loop until we get the attribute value. Note that we use a double
00767     //  loop here to avoid the setup/teardown overhead of the exception
00768     //  handler on every round.
00769     //
00770     XMLCh   nextCh;
00771     XMLCh   secondCh = 0;
00772     States  curState = InContent;
00773     bool    firstNonWS = false;
00774     bool    gotLeadingSurrogate = false;
00775     bool    escaped;
00776     while (true)
00777     {
00778     try
00779     {
00780         while(true)
00781         {
00782             nextCh = fReaderMgr->getNextChar();
00783 
00784             if (!nextCh)
00785                 ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
00786 
00787             // Check for our ending quote in the same entity
00788             if (nextCh == quoteCh)
00789             {
00790                 if (curReader == fReaderMgr->getCurrentReaderNum())
00791                     return true;
00792 
00793                 // Watch for spillover into a previous entity
00794                 if (curReader > fReaderMgr->getCurrentReaderNum())
00795                 {
00796                     fScanner->emitError(XMLErrs::PartialMarkupInEntity);
00797                     return false;
00798                 }
00799             }
00800 
00801             //
00802             //  Check for an entity ref now, before we let it affect our
00803             //  whitespace normalization logic below. We ignore the empty flag
00804             //  in this one.
00805             //
00806             escaped = false;
00807             if (nextCh == chAmpersand)
00808             {
00809                 if (scanEntityRef(nextCh, secondCh, escaped) != EntityExp_Returned)
00810                 {
00811                     gotLeadingSurrogate = false;
00812                     continue;
00813                 }
00814             }
00815             else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
00816             {
00817                 // Check for correct surrogate pairs
00818                 if (gotLeadingSurrogate)
00819                     fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
00820                 else
00821                     gotLeadingSurrogate = true;
00822             }
00823              else
00824             {
00825                 if (gotLeadingSurrogate)
00826                 {
00827                     if ((nextCh < 0xDC00) || (nextCh > 0xDFFF))
00828                         fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
00829                 }
00830                 // Its got to at least be a valid XML character
00831                 else if (!fReaderMgr->getCurrentReader()->isXMLChar(nextCh))
00832                 {
00833                     XMLCh tmpBuf[9];
00834                     XMLString::binToText
00835                     (
00836                         nextCh
00837                         , tmpBuf
00838                         , 8
00839                         , 16
00840                         , fMemoryManager
00841                     );
00842                     fScanner->emitError
00843                     (
00844                         XMLErrs::InvalidCharacterInAttrValue
00845                         , attrName
00846                         , tmpBuf
00847                     );
00848                 }
00849 
00850                 gotLeadingSurrogate = false;
00851             }
00852 
00853             //
00854             //  If its not escaped, then make sure its not a < character, which
00855             //  is not allowed in attribute values.
00856             //
00857             if (!escaped && (nextCh == chOpenAngle))
00858                 fScanner->emitError(XMLErrs::BracketInAttrValue, attrName);
00859 
00860             //
00861             //  If the attribute is a CDATA type we do simple replacement of
00862             //  tabs and new lines with spaces, if the character is not escaped
00863             //  by way of a char ref.
00864             //
00865             //  Otherwise, we do the standard non-CDATA normalization of
00866             //  compressing whitespace to single spaces and getting rid of
00867             //  leading and trailing whitespace.
00868             //
00869             if (type == XMLAttDef::CData)
00870             {
00871                 if (!escaped)
00872                 {
00873                     if ((nextCh == 0x09) || (nextCh == 0x0A) || (nextCh == 0x0D))
00874                         nextCh = chSpace;
00875                 }
00876             }
00877              else
00878             {
00879                 if (curState == InWhitespace)
00880                 {
00881                     if (!fReaderMgr->getCurrentReader()->isWhitespace(nextCh))
00882                     {
00883                         if (firstNonWS)
00884                             toFill.append(chSpace);
00885                         curState = InContent;
00886                         firstNonWS = true;
00887                     }
00888                      else
00889                     {
00890                         continue;
00891                     }
00892                 }
00893                  else if (curState == InContent)
00894                 {
00895                     if (fReaderMgr->getCurrentReader()->isWhitespace(nextCh))
00896                     {
00897                         curState = InWhitespace;
00898                         continue;
00899                     }
00900                     firstNonWS = true;
00901                 }
00902             }
00903 
00904             // Else add it to the buffer
00905             toFill.append(nextCh);
00906 
00907             if (secondCh)
00908             {
00909                 toFill.append(secondCh);
00910                 secondCh=0;
00911             }
00912         }
00913     }
00914 
00915     catch(const EndOfEntityException&)
00916     {
00917         // Just eat it and continue.
00918         gotLeadingSurrogate = false;
00919         escaped = false;
00920     }
00921     }
00922     return true;
00923 }
00924 
00925 
00926 bool DTDScanner::scanCharRef(XMLCh& first, XMLCh& second)
00927 {
00928     bool gotOne = false;
00929     unsigned int value = 0;
00930 
00931     //
00932     //  Set the radix. Its supposed to be a lower case x if hex. But, in
00933     //  order to recover well, we check for an upper and put out an error
00934     //  for that.
00935     //
00936     unsigned int radix = 10;
00937 
00938     if (fReaderMgr->skippedChar(chLatin_x))
00939     {
00940         radix = 16;
00941     }
00942      else if (fReaderMgr->skippedChar(chLatin_X))
00943     {
00944         fScanner->emitError(XMLErrs::HexRadixMustBeLowerCase);
00945         radix = 16;
00946     }
00947 
00948     while (true)
00949     {
00950         const XMLCh nextCh = fReaderMgr->peekNextChar();
00951 
00952         // Watch for EOF
00953         if (!nextCh)
00954             ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
00955 
00956         // Break out on the terminating semicolon
00957         if (nextCh == chSemiColon)
00958         {
00959             fReaderMgr->getNextChar();
00960             break;
00961         }
00962 
00963         //
00964         //  Convert this char to a binary value, or bail out if its not
00965         //  one.
00966         //
00967         unsigned int nextVal;
00968         if ((nextCh >= chDigit_0) && (nextCh <= chDigit_9))
00969             nextVal = (unsigned int)(nextCh - chDigit_0);
00970         else if ((nextCh >= chLatin_A) && (nextCh <= chLatin_F))
00971             nextVal= (unsigned int)(10 + (nextCh - chLatin_A));
00972         else if ((nextCh >= chLatin_a) && (nextCh <= chLatin_f))
00973             nextVal = (unsigned int)(10 + (nextCh - chLatin_a));
00974         else
00975         {
00976             //
00977             //  If we got at least a sigit, then do an unterminated ref
00978             //  error. Else, do an expected a numerical ref thing.
00979             //
00980             if (gotOne)
00981                 fScanner->emitError(XMLErrs::UnterminatedCharRef);
00982             else
00983                 fScanner->emitError(XMLErrs::ExpectedNumericalCharRef);
00984 
00985             return false;
00986         }
00987 
00988         //
00989         //  Make sure its valid for the radix. If not, then just eat the
00990         //  digit and go on after issueing an error. Else, update the
00991         //  running value with this new digit.
00992         //
00993         if (nextVal >= radix)
00994         {
00995             XMLCh tmpStr[2];
00996             tmpStr[0] = nextCh;
00997             tmpStr[1] = chNull;
00998             fScanner->emitError(XMLErrs::BadDigitForRadix, tmpStr);
00999         }
01000          else
01001         {
01002             value = (value * radix) + nextVal;
01003         }
01004 
01005         // Indicate that we got at least one good digit
01006         gotOne = true;
01007 
01008         // Eat the char we just processed
01009         fReaderMgr->getNextChar();
01010     }
01011 
01012     // Return the char (or chars)
01013     // And check if the character expanded is valid or not
01014     if (value >= 0x10000 && value <= 0x10FFFF)
01015     {
01016         value -= 0x10000;
01017         first  = XMLCh((value >> 10) + 0xD800);
01018         second = XMLCh((value & 0x3FF) + 0xDC00);
01019     }
01020     else if (value <= 0xFFFD)
01021     {
01022         first  = XMLCh(value);
01023         second = 0;
01024         if (!fReaderMgr->getCurrentReader()->isXMLChar(first) && !fReaderMgr->getCurrentReader()->isControlChar(first)) {
01025             // Character reference was not in the valid range
01026             fScanner->emitError(XMLErrs::InvalidCharacterRef);
01027             return false;
01028         }
01029     }
01030     else {
01031         // Character reference was not in the valid range
01032         fScanner->emitError(XMLErrs::InvalidCharacterRef);
01033         return false;
01034     }
01035 
01036     return true;
01037 }
01038 
01039 
01040 ContentSpecNode*
01041 DTDScanner::scanChildren(const DTDElementDecl& elemDecl, XMLBuffer& bufToUse)
01042 {
01043     // Check for a PE ref here, but don't require spaces
01044     checkForPERef(false, true);
01045 
01046     ValueStackOf<XMLSize_t>* arrNestedDecl=NULL;
01047     //
01048     //  We know that the caller just saw an opening parenthesis, so we need
01049     //  to parse until we hit the end of it; if we find several parenthesis,
01050     //  store them in an array to be processed later.
01051     //
01052     //  We have to check for one up front, since it could be something like
01053     //  (((a)*)) etc...
01054     //
01055     ContentSpecNode* curNode = 0;
01056     while(fReaderMgr->skippedChar(chOpenParen))
01057     {
01058         // to check entity nesting
01059         const XMLSize_t curReader = fReaderMgr->getCurrentReaderNum();
01060         if(arrNestedDecl==NULL)
01061             arrNestedDecl=new (fMemoryManager) ValueStackOf<XMLSize_t>(5, fMemoryManager);
01062         arrNestedDecl->push(curReader);
01063 
01064         // Check for a PE ref here, but don't require spaces
01065         checkForPERef(false, true);
01066     }
01067 
01068     // We must find a leaf node here, either standalone or nested in the parenthesis
01069     if (!fReaderMgr->getName(bufToUse))
01070     {
01071         fScanner->emitError(XMLErrs::ExpectedElementName);
01072         return 0;
01073     }
01074 
01075     //
01076     //  Create a leaf node for it. If we can find the element id for
01077     //  this element, then use it. Else, we have to fault in an element
01078     //  decl, marked as created because of being in a content model.
01079     //
01080     XMLElementDecl* decl = fDTDGrammar->getElemDecl(fEmptyNamespaceId, 0, bufToUse.getRawBuffer(), Grammar::TOP_LEVEL_SCOPE);
01081     if (!decl)
01082     {
01083         decl = new (fGrammarPoolMemoryManager) DTDElementDecl
01084         (
01085             bufToUse.getRawBuffer()
01086             , fEmptyNamespaceId
01087             , DTDElementDecl::Any
01088             , fGrammarPoolMemoryManager
01089         );
01090         decl->setCreateReason(XMLElementDecl::InContentModel);
01091         decl->setExternalElemDeclaration(isReadingExternalEntity());
01092         fDTDGrammar->putElemDecl(decl);
01093     }
01094     curNode = new (fGrammarPoolMemoryManager) ContentSpecNode
01095     (
01096         decl->getElementName()
01097         , fGrammarPoolMemoryManager
01098     );
01099 
01100     // Check for a PE ref here, but don't require spaces
01101     const bool gotSpaces = checkForPERef(false, true);
01102 
01103     // Check for a repetition character after the leaf
01104     XMLCh repCh = fReaderMgr->peekNextChar();
01105     ContentSpecNode* tmpNode = makeRepNode(repCh, curNode, fGrammarPoolMemoryManager);
01106     if (tmpNode != curNode)
01107     {
01108         if (gotSpaces)
01109         {
01110             if (fScanner->emitErrorWillThrowException(XMLErrs::UnexpectedWhitespace))
01111             {
01112                 delete tmpNode;
01113             }
01114             fScanner->emitError(XMLErrs::UnexpectedWhitespace);
01115         }
01116         fReaderMgr->getNextChar();
01117         curNode = tmpNode;
01118     }
01119 
01120     while(arrNestedDecl==NULL || !arrNestedDecl->empty())
01121     {
01122         // Check for a PE ref here, but don't require spaces
01123         checkForPERef(false, true);
01124 
01125         //
01126         //  Ok, the next character tells us what kind of content this particular
01127         //  model this particular parentesized section is. Its either a choice if
01128         //  we see ',', a sequence if we see '|', or a single leaf node if we see
01129         //  a closing paren.
01130         //
01131         const XMLCh opCh = fReaderMgr->peekNextChar();
01132 
01133         if ((opCh != chComma)
01134         &&  (opCh != chPipe)
01135         &&  (opCh != chCloseParen))
01136         {
01137             // Not a legal char, so delete our node and return failure
01138             delete curNode;
01139             fScanner->emitError(XMLErrs::ExpectedSeqChoiceLeaf);
01140             return 0;
01141         }
01142 
01143         //
01144         //  Create the head node of the correct type. We need this to remember
01145         //  the top of the local tree. If it was a single subexpr, then just
01146         //  set the head node to the current node. For the others, we'll build
01147         //  the tree off the second child as we move across.
01148         //
01149         ContentSpecNode* headNode = 0;
01150         ContentSpecNode::NodeTypes curType = ContentSpecNode::UnknownType;
01151         if (opCh == chComma)
01152         {
01153             curType = ContentSpecNode::Sequence;
01154             headNode = new (fGrammarPoolMemoryManager) ContentSpecNode
01155             (
01156                 curType
01157                 , curNode
01158                 , 0
01159                 , true
01160                 , true
01161                 , fGrammarPoolMemoryManager
01162             );
01163             curNode = headNode;
01164         }
01165          else if (opCh == chPipe)
01166         {
01167             curType = ContentSpecNode::Choice;
01168             headNode = new (fGrammarPoolMemoryManager) ContentSpecNode
01169             (
01170                 curType
01171                 , curNode
01172                 , 0
01173                 , true
01174                 , true
01175                 , fGrammarPoolMemoryManager
01176             );
01177             curNode = headNode;
01178         }
01179          else
01180         {
01181             headNode = curNode;
01182             fReaderMgr->getNextChar();
01183         }
01184 
01185         //
01186         //  If it was a sequence or choice, we just loop until we get to the
01187         //  end of our section, adding each new leaf or sub expression to the
01188         //  right child of the current node, and making that new node the current
01189         //  node.
01190         //
01191         if ((opCh == chComma) || (opCh == chPipe))
01192         {
01193             ContentSpecNode* lastNode = 0;
01194             while (true)
01195             {
01196                 //
01197                 //  The next thing must either be another | or , character followed
01198                 //  by another leaf or subexpression, or a closing parenthesis, or a
01199                 //  PE ref.
01200                 //
01201                 if (fReaderMgr->lookingAtChar(chPercent))
01202                 {
01203                     checkForPERef(false, true);
01204                 }
01205                  else if (fReaderMgr->skippedSpace())
01206                 {
01207                     // Just skip whitespace
01208                     fReaderMgr->skipPastSpaces();
01209                 }
01210                  else if (fReaderMgr->skippedChar(chCloseParen))
01211                 {
01212                     //
01213                     //  We've hit the end of this section, so break out. But, we
01214                     //  need to see if we left a partial sequence of choice node
01215                     //  without a second node. If so, we have to undo that and
01216                     //  put its left child into the right node of the previous
01217                     //  node.
01218                     //
01219                     if ((curNode->getType() == ContentSpecNode::Choice)
01220                     ||  (curNode->getType() == ContentSpecNode::Sequence))
01221                     {
01222                         if (!curNode->getSecond())
01223                         {
01224                             ContentSpecNode* saveFirst = curNode->orphanFirst();
01225                             lastNode->setSecond(saveFirst);
01226                             curNode = lastNode;
01227                         }
01228                     }
01229                     break;
01230                 }
01231                  else if (fReaderMgr->skippedChar(opCh))
01232                 {
01233                     // Check for a PE ref here, but don't require spaces
01234                     checkForPERef(false, true);
01235 
01236                     if (fReaderMgr->skippedChar(chOpenParen))
01237                     {
01238                         const XMLSize_t curReader = fReaderMgr->getCurrentReaderNum();
01239 
01240                         // Recurse to handle this new guy
01241                         ContentSpecNode* subNode;
01242                         try {
01243                             subNode = scanChildren(elemDecl, bufToUse);
01244                         }
01245                         catch (const XMLErrs::Codes)
01246                         {
01247                             delete headNode;
01248                             throw;
01249                         }
01250 
01251                         // If it failed, we are done, clean up here and return failure
01252                         if (!subNode)
01253                         {
01254                             delete headNode;
01255                             return 0;
01256                         }
01257 
01258                         if (curReader != fReaderMgr->getCurrentReaderNum() && fScanner->getValidationScheme() == XMLScanner::Val_Always)
01259                             fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE);
01260 
01261                         // Else patch it in and make it the new current
01262                         ContentSpecNode* newCur = new (fGrammarPoolMemoryManager) ContentSpecNode
01263                         (
01264                             curType
01265                             , subNode
01266                             , 0
01267                             , true
01268                             , true
01269                             , fGrammarPoolMemoryManager
01270                         );
01271                         curNode->setSecond(newCur);
01272                         lastNode = curNode;
01273                         curNode = newCur;
01274                     }
01275                      else
01276                     {
01277                         //
01278                         //  Got to be a leaf node, so get a name. If we cannot get
01279                         //  one, then clean up and get outa here.
01280                         //
01281                         if (!fReaderMgr->getName(bufToUse))
01282                         {
01283                             delete headNode;
01284                             fScanner->emitError(XMLErrs::ExpectedElementName);
01285                             return 0;
01286                         }
01287 
01288                         //
01289                         //  Create a leaf node for it. If we can find the element
01290                         //  id for this element, then use it. Else, we have to
01291                         //  fault in an element decl, marked as created because
01292                         //  of being in a content model.
01293                         //
01294                         XMLElementDecl* decl = fDTDGrammar->getElemDecl(fEmptyNamespaceId, 0, bufToUse.getRawBuffer(), Grammar::TOP_LEVEL_SCOPE);
01295                         if (!decl)
01296                         {
01297                             decl = new (fGrammarPoolMemoryManager) DTDElementDecl
01298                             (
01299                                 bufToUse.getRawBuffer()
01300                                 , fEmptyNamespaceId
01301                                 , DTDElementDecl::Any
01302                                 , fGrammarPoolMemoryManager
01303                             );
01304                             decl->setCreateReason(XMLElementDecl::InContentModel);
01305                             decl->setExternalElemDeclaration(isReadingExternalEntity());
01306                             fDTDGrammar->putElemDecl(decl);
01307                         }
01308 
01309                         ContentSpecNode* tmpLeaf = new (fGrammarPoolMemoryManager) ContentSpecNode
01310                         (
01311                             decl->getElementName()
01312                             , fGrammarPoolMemoryManager
01313                         );
01314 
01315                         // Check for a repetition character after the leaf
01316                         const XMLCh repCh = fReaderMgr->peekNextChar();
01317                         ContentSpecNode* tmpLeaf2 = makeRepNode(repCh, tmpLeaf, fGrammarPoolMemoryManager);
01318                         if (tmpLeaf != tmpLeaf2)
01319                             fReaderMgr->getNextChar();
01320 
01321                         //
01322                         //  Create a new sequence or choice node, with the leaf
01323                         //  (or rep surrounding it) we just got as its first node.
01324                         //  Make the new node the second node of the current node,
01325                         //  and then make it the current node.
01326                         //
01327                         ContentSpecNode* newCur = new (fGrammarPoolMemoryManager) ContentSpecNode
01328                         (
01329                             curType
01330                             , tmpLeaf2
01331                             , 0
01332                             , true
01333                             , true
01334                             , fGrammarPoolMemoryManager
01335                         );
01336                         curNode->setSecond(newCur);
01337                         lastNode = curNode;
01338                         curNode = newCur;
01339                     }
01340                 }
01341                  else
01342                 {
01343                     // Cannot be valid
01344                     delete headNode;  // emitError may do a throw so need to clean-up first
01345                     if (opCh == chComma)
01346                     {
01347                         fScanner->emitError(XMLErrs::ExpectedChoiceOrCloseParen);
01348                     }
01349                      else
01350                     {
01351                         fScanner->emitError
01352                         (
01353                             XMLErrs::ExpectedSeqOrCloseParen
01354                             , elemDecl.getFullName()
01355                         );
01356                     }                
01357                     return 0;
01358                 }
01359             }
01360         }
01361 
01362         //
01363         //  We saw the terminating parenthesis so lets check for any repetition
01364         //  character, and create a node for that, making the head node the child
01365         //  of it.
01366         //
01367         const XMLCh repCh = fReaderMgr->peekNextChar();
01368         curNode = makeRepNode(repCh, headNode, fGrammarPoolMemoryManager);
01369         if (curNode != headNode)
01370             fReaderMgr->getNextChar();
01371 
01372         // prepare for recursion
01373         if(arrNestedDecl==NULL)
01374             break;
01375         else
01376         {
01377             // If that failed, no need to go further, return failure
01378             if (!curNode)
01379                 return 0;
01380 
01381             const XMLSize_t curReader = arrNestedDecl->pop();
01382             if (curReader != fReaderMgr->getCurrentReaderNum() && fScanner->getValidationScheme() == XMLScanner::Val_Always)
01383                 fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE);
01384 
01385             if(arrNestedDecl->empty())
01386             {
01387                 delete arrNestedDecl;
01388                 arrNestedDecl=NULL;
01389             }
01390         }
01391     }
01392 
01393     return curNode;
01394 }
01395 
01396 
01397 //
01398 //  We get here after the '<!--' part of the comment. We scan past the
01399 //  terminating '-->' It will calls the appropriate handler with the comment
01400 //  text, if one is provided. A comment can be in either the document or
01401 //  the DTD, so the fInDocument flag is used to know which handler to send
01402 //  it to.
01403 //
01404 void DTDScanner::scanComment()
01405 {
01406     enum States
01407     {
01408         InText
01409         , OneDash
01410         , TwoDashes
01411     };
01412 
01413     // Get a buffer for this
01414     XMLBufBid bbComment(fBufMgr);
01415 
01416     //
01417     //  Get the comment text into a temp buffer. Be sure to use temp buffer
01418     //  two here, since its to be used for stuff that is potentially longer
01419     //  than just a name.
01420     //
01421     bool   gotLeadingSurrogate = false;
01422     States curState = InText;
01423     while (true)
01424     {
01425         // Get the next character
01426         const XMLCh nextCh = fReaderMgr->getNextChar();
01427 
01428         //  Watch for an end of file
01429         if (!nextCh)
01430         {
01431             fScanner->emitError(XMLErrs::UnterminatedComment);
01432             ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
01433         }
01434 
01435         // Check for correct surrogate pairs
01436         if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
01437         {
01438             if (gotLeadingSurrogate)
01439                 fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
01440             else
01441                 gotLeadingSurrogate = true;
01442         }
01443         else
01444         {
01445             if (gotLeadingSurrogate)
01446             {
01447                 if ((nextCh < 0xDC00) || (nextCh > 0xDFFF))
01448                     fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
01449             }
01450             // Its got to at least be a valid XML character
01451             else if (!fReaderMgr->getCurrentReader()->isXMLChar(nextCh)) {
01452 
01453                 XMLCh tmpBuf[9];
01454                 XMLString::binToText
01455                 (
01456                     nextCh
01457                     , tmpBuf
01458                     , 8
01459                     , 16
01460                     , fMemoryManager
01461                 );
01462                 fScanner->emitError(XMLErrs::InvalidCharacter, tmpBuf);
01463             }
01464 
01465             gotLeadingSurrogate = false;
01466         }
01467 
01468         if (curState == InText)
01469         {
01470             // If its a dash, go to OneDash state. Otherwise take as text
01471             if (nextCh == chDash)
01472                 curState = OneDash;
01473             else
01474                 bbComment.append(nextCh);
01475         }
01476         else if (curState == OneDash)
01477         {
01478             //
01479             //  If its another dash, then we change to the two dashes states.
01480             //  Otherwise, we have to put in the deficit dash and the new
01481             //  character and go back to InText.
01482             //
01483             if (nextCh == chDash)
01484             {
01485                 curState = TwoDashes;
01486             }
01487             else
01488             {
01489                 bbComment.append(chDash);
01490                 bbComment.append(nextCh);
01491                 curState = InText;
01492             }
01493         }
01494         else if (curState == TwoDashes)
01495         {
01496             // The next character must be the closing bracket
01497             if (nextCh != chCloseAngle)
01498             {
01499                 fScanner->emitError(XMLErrs::IllegalSequenceInComment);
01500                 fReaderMgr->skipPastChar(chCloseAngle);
01501                 return;
01502             }
01503             break;
01504         }
01505     }
01506 
01507     // If there is a doc type handler, then pass on the comment stuff
01508     if (fDocTypeHandler)
01509         fDocTypeHandler->doctypeComment(bbComment.getRawBuffer());
01510 }
01511 
01512 
01513 bool DTDScanner::scanContentSpec(DTDElementDecl& toFill)
01514 {
01515     //
01516     //  Check for for a couple of the predefined content type strings. If
01517     //  its not one of these, its got to be a parenthesized reg ex type
01518     //  expression.
01519     //
01520     if (fReaderMgr->skippedString(XMLUni::fgEmptyString))
01521     {
01522         toFill.setModelType(DTDElementDecl::Empty);
01523         return true;
01524     }
01525 
01526     if (fReaderMgr->skippedString(XMLUni::fgAnyString))
01527     {
01528         toFill.setModelType(DTDElementDecl::Any);
01529         return true;
01530     }
01531 
01532     // Its got to be a parenthesized regular expression
01533     if (!fReaderMgr->skippedChar(chOpenParen))
01534     {
01535         fScanner->emitError
01536         (
01537             XMLErrs::ExpectedContentSpecExpr
01538             , toFill.getFullName()
01539         );
01540         return false;
01541     }
01542 
01543     // Get the current reader id, so we can test for partial markup
01544     const XMLSize_t curReader = fReaderMgr->getCurrentReaderNum();
01545 
01546     // We could have a PE ref here, but don't require space
01547     checkForPERef(false, true);
01548 
01549     //
01550     //  Now we look for a PCDATA string. If its PCDATA, then it must be a
01551     //  MIXED model. Otherwise, it must be a regular list of children in
01552     //  a regular expression perhaps.
01553     //
01554     bool status;
01555     if (fReaderMgr->skippedString(XMLUni::fgPCDATAString))
01556     {
01557         // Set the model to mixed
01558         toFill.setModelType(DTDElementDecl::Mixed_Simple);
01559         status = scanMixed(toFill);
01560 
01561         //
01562         //  If we are validating we have to check that there are no multiple
01563         //  uses of any child elements.
01564         //
01565         if (fScanner->getValidationScheme() == XMLScanner::Val_Always)
01566         {
01567             if (((const MixedContentModel*)toFill.getContentModel())->hasDups())
01568                 fScanner->getValidator()->emitError(XMLValid::RepElemInMixed);
01569         }
01570     }
01571      else
01572     {
01573         //
01574         //  We have to do a recursive scan of the content model. Create a
01575         //  buffer for it to use, for efficiency. It returns the top ofthe
01576         //  content spec node tree, which we set if successful.
01577         //
01578         toFill.setModelType(DTDElementDecl::Children);
01579         XMLBufBid bbTmp(fBufMgr);
01580         ContentSpecNode* resNode = scanChildren(toFill, bbTmp.getBuffer());
01581         status = (resNode != 0);
01582         if (status)
01583             toFill.setContentSpec(resNode);
01584     }
01585 
01586     // Make sure we are on the same reader as where we started
01587     if (curReader != fReaderMgr->getCurrentReaderNum() && fScanner->getValidationScheme() == XMLScanner::Val_Always)
01588         fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE);
01589 
01590     return status;
01591 }
01592 
01593 
01594 void DTDScanner::scanDefaultDecl(DTDAttDef& toFill)
01595 {
01596     if (fReaderMgr->skippedString(XMLUni::fgRequiredString))
01597     {
01598         toFill.setDefaultType(XMLAttDef::Required);
01599         return;
01600     }
01601 
01602     if (fReaderMgr->skippedString(XMLUni::fgImpliedString))
01603     {
01604         toFill.setDefaultType(XMLAttDef::Implied);
01605         return;
01606     }
01607 
01608     if (fReaderMgr->skippedString(XMLUni::fgFixedString))
01609     {
01610         //
01611         //  There must be space before the fixed value. If there is not, then
01612         //  emit an error but keep going.
01613         //
01614         if (!fReaderMgr->skippedSpace())
01615             fScanner->emitError(XMLErrs::ExpectedWhitespace);
01616         else
01617             fReaderMgr->skipPastSpaces();
01618         toFill.setDefaultType(XMLAttDef::Fixed);
01619     }
01620      else
01621     {
01622         toFill.setDefaultType(XMLAttDef::Default);
01623     }
01624 
01625     //
01626     //  If we got here, its fixed or default, so we need to get a value.
01627     //  If we don't, then emit an error but just set the default value to
01628     //  an empty string and try to keep going.
01629     //
01630     // Check for PE ref or optional whitespace
01631     checkForPERef(false, true);
01632 
01633     XMLBufBid bbValue(fBufMgr);
01634     if (!scanAttValue(toFill.getFullName(), bbValue.getBuffer(), toFill.getType()))
01635         fScanner->emitError(XMLErrs::ExpectedDefAttrDecl);
01636 
01637     toFill.setValue(bbValue.getRawBuffer());
01638 }
01639 
01640 
01641 //
01642 //  This is called after seeing '<!ELEMENT' which indicates that an element
01643 //  markup is starting. This guy scans the rest of it and adds it to the
01644 //  element decl pool if it has not already been declared.
01645 //
01646 void DTDScanner::scanElementDecl()
01647 {
01648     //
01649     //  Space is legal (required actually) here so check for a PE ref. If
01650     //  we don't get our whitespace, then issue and error, but try to keep
01651     //  going.
01652     //
01653     if (!checkForPERef(false, true))
01654         fScanner->emitError(XMLErrs::ExpectedWhitespace);
01655 
01656     // Get a buffer for the element name and scan in the name
01657     XMLBufBid bbName(fBufMgr);
01658     if (!fReaderMgr->getName(bbName.getBuffer()))
01659     {
01660         fScanner->emitError(XMLErrs::ExpectedElementName);
01661         fReaderMgr->skipPastChar(chCloseAngle);
01662         return;
01663     }
01664 
01665     // Look this guy up in the element decl pool
01666     DTDElementDecl* decl = (DTDElementDecl*) fDTDGrammar->getElemDecl(fEmptyNamespaceId, 0, bbName.getRawBuffer(), Grammar::TOP_LEVEL_SCOPE);
01667 
01668     //
01669     //  If it does not exist, then we need to create it. If it does and
01670     //  its marked as declared, then that's an error, but we still need to
01671     //  scan over the content model so use the dummy declaration that the
01672     //  parsing code can fill in.
01673     //
01674     if (decl)
01675     {
01676         if (decl->isDeclared())
01677         {
01678             if (fScanner->getValidationScheme() == XMLScanner::Val_Always)
01679                 fScanner->getValidator()->emitError(XMLValid::ElementAlreadyExists, bbName.getRawBuffer());
01680 
01681             if (!fDumElemDecl)
01682                 fDumElemDecl = new (fMemoryManager) DTDElementDecl
01683                 (
01684                     bbName.getRawBuffer()
01685                     , fEmptyNamespaceId
01686                     , DTDElementDecl::Any
01687                     , fMemoryManager
01688                 );
01689             else
01690                 fDumElemDecl->setElementName(bbName.getRawBuffer(),fEmptyNamespaceId);
01691         }
01692     }
01693      else
01694     {
01695         //
01696         //  Create the new empty declaration to fill in and put it into
01697         //  the decl pool.
01698         //
01699         decl = new (fGrammarPoolMemoryManager) DTDElementDecl
01700         (
01701             bbName.getRawBuffer()
01702             , fEmptyNamespaceId
01703             , DTDElementDecl::Any
01704             , fGrammarPoolMemoryManager
01705         );
01706         fDTDGrammar->putElemDecl(decl);
01707     }
01708 
01709     // Set a flag for whether we will ignore this one
01710     const bool isIgnored = (decl == fDumElemDecl);
01711 
01712     // Mark this one if being externally declared
01713     decl->setExternalElemDeclaration(isReadingExternalEntity());
01714 
01715     // Mark this one as being declared
01716     decl->setCreateReason(XMLElementDecl::Declared);
01717 
01718     // Another check for a PE ref, with at least required whitespace
01719     if (!checkForPERef(false, true))
01720         fScanner->emitError(XMLErrs::ExpectedWhitespace);
01721 
01722     // And now scan the content model for this guy.
01723     if (!scanContentSpec(*decl))
01724     {
01725         fReaderMgr->skipPastChar(chCloseAngle);
01726         return;
01727     }
01728 
01729     // Another check for a PE ref, but we don't require whitespace here
01730     checkForPERef(false, true);
01731 
01732     // And we should have the ending angle bracket
01733     if (!fReaderMgr->skippedChar(chCloseAngle))
01734     {
01735         fScanner->emitError(XMLErrs::UnterminatedElementDecl, bbName.getRawBuffer());
01736         fReaderMgr->skipPastChar(chCloseAngle);
01737     }
01738 
01739     //
01740     //  If we have a DTD handler tell it about the new element decl. We
01741     //  tell it if its one that can be ignored, cause its an override of a
01742     //  previously existing decl. If it is being ignored, only call back
01743     //  if advanced callbacks are enabled.
01744     //
01745     if (fDocTypeHandler)
01746         fDocTypeHandler->elementDecl(*decl, isIgnored);
01747 }
01748 
01749 
01750 //
01751 //  This method will process a general or parameter entity reference. The
01752 //  entity name and entity text will be stored in the entity pool. The value
01753 //  of the entity will be scanned for any other parameter entity or char
01754 //  references which will be expanded. So the stored value can only have
01755 //  general entity references when done.
01756 //
01757 void DTDScanner::scanEntityDecl()
01758 {
01759     //
01760     //  Space is required here, but we cannot check for a PE Ref since
01761     //  there could be a legal (no-ref) percent sign here. Since any
01762     //  entity that ended here would be illegal, we just skip spaces
01763     //  and then check for a percent.
01764     //
01765     if (!fReaderMgr->lookingAtSpace())
01766         fScanner->emitError(XMLErrs::ExpectedWhitespace);
01767     else
01768         fReaderMgr->skipPastSpaces();
01769     bool isPEDecl = fReaderMgr->skippedChar(chPercent);
01770 
01771     //
01772     //  If a PE decl, then check if it is followed by a space; if it is so, 
01773     //  eat the percent and check for spaces or a PE ref on the other side of it. 
01774     //  Otherwise, it has to be an entity reference for a general entity.
01775     //
01776     if (isPEDecl)
01777     {
01778         if(!fReaderMgr->getCurrentReader()->isWhitespace(fReaderMgr->peekNextChar()))
01779         {
01780             isPEDecl=false;
01781             while (true)
01782             {
01783                if (!expandPERef(false, false, true, false))
01784                   fScanner->emitError(XMLErrs::ExpectedEntityRefName);
01785                // And skip any more spaces in the expanded value
01786                if (fReaderMgr->skippedSpace())
01787                   fReaderMgr->skipPastSpaces();
01788                if (!fReaderMgr->skippedChar(chPercent))
01789                   break;
01790             }
01791         }
01792         else if (!checkForPERef(false, true))
01793             fScanner->emitError(XMLErrs::ExpectedWhitespace);
01794     }
01795 
01796     //
01797     //  Now lets get a name, which should be the name of the entity. We
01798     //  have to get a buffer for this.
01799     //
01800     XMLBufBid bbName(fBufMgr);
01801     if (!fReaderMgr->getName(bbName.getBuffer()))
01802     {
01803         fScanner->emitError(XMLErrs::ExpectedPEName);
01804         fReaderMgr->skipPastChar(chCloseAngle);
01805         return;
01806     }
01807 
01808     // If namespaces are enabled, then no colons allowed
01809     if (fScanner->getDoNamespaces())
01810     {
01811         if (XMLString::indexOf(bbName.getRawBuffer(), chColon) != -1)
01812             fScanner->emitError(XMLErrs::ColonNotLegalWithNS);
01813     }
01814 
01815     //
01816     //  See if this entity already exists. If so, then the existing one
01817     //  takes precendence. So we use the local dummy decl to parse into
01818     //  and just ignore the results.
01819     //
01820     DTDEntityDecl* entityDecl;
01821     if (isPEDecl)
01822         entityDecl = fPEntityDeclPool->getByKey(bbName.getRawBuffer());
01823     else
01824         entityDecl = fDTDGrammar->getEntityDecl(bbName.getRawBuffer());
01825 
01826     if (entityDecl)
01827     {
01828         if (!fDumEntityDecl)
01829             fDumEntityDecl = new (fMemoryManager) DTDEntityDecl(fMemoryManager);
01830         fDumEntityDecl->setName(bbName.getRawBuffer());
01831         entityDecl = fDumEntityDecl;
01832     }
01833      else
01834     {
01835         // Its not in existence already, then create an entity decl for it
01836         entityDecl = new (fGrammarPoolMemoryManager) DTDEntityDecl(bbName.getRawBuffer(), false, fGrammarPoolMemoryManager);
01837 
01838         //
01839         //  Set the declaration location. The parameter indicates whether its
01840         //  declared in the content/internal subset, so we know whether or not
01841         //  its in the external subset.
01842         //
01843         entityDecl->setDeclaredInIntSubset(fInternalSubset);
01844 
01845         // Add it to the appropriate entity decl pool
01846         if (isPEDecl)
01847             fPEntityDeclPool->put(entityDecl);
01848          else
01849             fDTDGrammar->putEntityDecl(entityDecl);
01850     }
01851 
01852     // Set a flag that indicates whether we are ignoring this one
01853     const bool isIgnored = (entityDecl == fDumEntityDecl);
01854 
01855     // Set the PE flag on it
01856     entityDecl->setIsParameter(isPEDecl);
01857 
01858     //
01859     //  Space is legal (required actually) here so check for a PE ref. If
01860     //  we don't get our whitespace, then issue an error, but try to keep
01861     //  going.
01862     //
01863     if (!checkForPERef(false, true))
01864         fScanner->emitError(XMLErrs::ExpectedWhitespace);
01865 
01866     // save the hasNoDTD status for Entity Constraint Checking
01867     bool hasNoDTD = fScanner->getHasNoDTD();
01868     if (hasNoDTD && isPEDecl)
01869         fScanner->setHasNoDTD(false);
01870 
01871     // According to the type call the value scanning method
01872     if (!scanEntityDef(*entityDecl, isPEDecl))
01873     {
01874         fReaderMgr->skipPastChar(chCloseAngle);
01875         fScanner->setHasNoDTD(true);
01876         fScanner->emitError(XMLErrs::ExpectedEntityValue);
01877         return;
01878     }
01879     if (hasNoDTD)
01880         fScanner->setHasNoDTD(true);
01881 
01882     // Space is legal (but not required) here so check for a PE ref
01883     checkForPERef(false, true);
01884 
01885     // And then we have to have the closing angle bracket
01886     if (!fReaderMgr->skippedChar(chCloseAngle))
01887     {
01888         fScanner->emitError(XMLErrs::UnterminatedEntityDecl, entityDecl->getName());
01889         fReaderMgr->skipPastChar(chCloseAngle);
01890     }
01891 
01892     //
01893     //  If we have a doc type handler, then call it. But only call it for
01894     //  ignored elements if advanced callbacks are enabled.
01895     //
01896     if (fDocTypeHandler)
01897         fDocTypeHandler->entityDecl(*entityDecl, isPEDecl, isIgnored);
01898 }
01899 
01900 
01901 //
01902 //  This method will scan a general/character entity ref. It will either
01903 //  expand a char ref and return the value directly, or it will expand
01904 //  a general entity and a reader for it onto the reader stack.
01905 //
01906 //  The return value indicates whether the value was returned directly or
01907 //  pushed as a reader or it failed.
01908 //
01909 //  The escaped flag tells the caller whether the returnd parameter resulted
01910 //  from a character reference, which escapes the character in some cases. It
01911 //  only makes any difference if the return indicates the value was returned
01912 //  directly.
01913 //
01914 //  NOTE: This is only called when scanning attribute values, so we always
01915 //  expand general entities.
01916 //
01917 DTDScanner::EntityExpRes
01918 DTDScanner::scanEntityRef(XMLCh& firstCh, XMLCh& secondCh, bool& escaped)
01919 {
01920     // Assume no escape and no second char
01921     escaped = false;
01922     secondCh = 0;
01923 
01924     // We have to insure its all done in a single entity
01925     const XMLSize_t curReader = fReaderMgr->getCurrentReaderNum();
01926 
01927     //
01928     //  If the next char is a pound, then its a character reference and we
01929     //  need to expand it always.
01930     //
01931     if (fReaderMgr->skippedChar(chPound))
01932     {
01933         //
01934         //  Its a character reference, so scan it and get back the numeric
01935         //  value it represents. If it fails, just return immediately.
01936         //
01937         if (!scanCharRef(firstCh, secondCh))
01938             return EntityExp_Failed;
01939 
01940         if (curReader != fReaderMgr->getCurrentReaderNum())
01941             fScanner->emitError(XMLErrs::PartialMarkupInEntity);
01942 
01943         // Its now escaped since it was a char ref
01944         escaped = true;
01945         return EntityExp_Returned;
01946     }
01947 
01948     // Get the name of the general entity
01949     XMLBufBid bbName(fBufMgr);
01950     if (!fReaderMgr->getName(bbName.getBuffer()))
01951     {
01952         fScanner->emitError(XMLErrs::ExpectedEntityRefName);
01953         return EntityExp_Failed;
01954     }
01955 
01956     //
01957     //  Next char must be a semi-colon. But if its not, just emit
01958     //  an error and try to continue.
01959     //
01960     if (!fReaderMgr->skippedChar(chSemiColon))
01961         fScanner->emitError(XMLErrs::UnterminatedEntityRef, bbName.getRawBuffer());
01962 
01963     // Make sure it was all in one entity reader
01964     if (curReader != fReaderMgr->getCurrentReaderNum())
01965         fScanner->emitError(XMLErrs::PartialMarkupInEntity);
01966 
01967     // Look it up the name the general entity pool
01968     XMLEntityDecl* decl = fDTDGrammar->getEntityDecl(bbName.getRawBuffer());
01969 
01970     // If it does not exist, then obviously an error
01971     if (!decl)
01972     {
01973         // XML 1.0 Section 4.1
01974         if (fScanner->getStandalone() || fScanner->getHasNoDTD()) {
01975             fScanner->emitError(XMLErrs::EntityNotFound, bbName.getRawBuffer());
01976         }
01977         else {
01978             if (fScanner->getValidationScheme() == XMLScanner::Val_Always)
01979                 fScanner->getValidator()->emitError(XMLValid::VC_EntityNotFound, bbName.getRawBuffer());
01980         }
01981 
01982         return EntityExp_Failed;
01983     }
01984 
01985 
01986     //
01987     // XML 1.0 Section 4.1
01988     //  If we are a standalone document, then it has to have been declared
01989     //  in the internal subset.
01990     //
01991     if (fScanner->getStandalone() && !decl->getDeclaredInIntSubset())
01992         fScanner->emitError(XMLErrs::IllegalRefInStandalone, bbName.getRawBuffer());
01993 
01994     //
01995     //  If its a special char reference, then its escaped and we can return
01996     //  it directly.
01997     //
01998     if (decl->getIsSpecialChar())
01999     {
02000         firstCh = decl->getValue()[0];
02001         escaped = true;
02002         return EntityExp_Returned;
02003     }
02004 
02005     if (decl->isExternal())
02006     {
02007         // If its unparsed, then its not valid here
02008         // XML 1.0 Section 4.4.4 the appearance of a reference to an unparsed entity is forbidden.
02009         if (decl->isUnparsed())
02010         {
02011             fScanner->emitError(XMLErrs::NoUnparsedEntityRefs, bbName.getRawBuffer());
02012             return EntityExp_Failed;
02013         }
02014 
02015         // We are in an attribute value, so not valid.
02016         // XML 1.0 Section 4.4.4 a reference to an external entity in an attribute value is forbidden.
02017         fScanner->emitError(XMLErrs::NoExtRefsInAttValue);
02018 
02019         // And now create a reader to read this entity
02020         InputSource* srcUsed;
02021         XMLReader* reader = fReaderMgr->createReader
02022         (
02023             decl->getBaseURI()
02024             , decl->getSystemId()
02025             , decl->getPublicId()
02026             , false
02027             , XMLReader::RefFrom_NonLiteral
02028             , XMLReader::Type_General
02029             , XMLReader::Source_External
02030             , srcUsed
02031             , fScanner->getCalculateSrcOfs()
02032             , fScanner->getLowWaterMark()
02033             , fScanner->getDisableDefaultEntityResolution()
02034         );
02035 
02036         // Put a janitor on the source so it gets cleaned up on exit
02037         Janitor<InputSource> janSrc(srcUsed);
02038 
02039         //
02040         //  If the creation failed then throw an exception
02041         //
02042         if (!reader)
02043             ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Gen_CouldNotOpenExtEntity, srcUsed ? srcUsed->getSystemId() : decl->getSystemId(), fMemoryManager);
02044 
02045         //
02046         //  Push the reader. If its a recursive expansion, then emit an error
02047         //  and return an failure.
02048         //
02049         if (!fReaderMgr->pushReader(reader, decl))
02050         {
02051             fScanner->emitError(XMLErrs::RecursiveEntity, decl->getName());
02052             return EntityExp_Failed;
02053         }
02054 
02055         // If it starts with the XML string, then parse a text decl
02056         if (fScanner->checkXMLDecl(true))
02057             scanTextDecl();
02058     }
02059      else
02060     {
02061         //
02062         //  Create a reader over a memory stream over the entity value
02063         //  We force it to assume UTF-16 by passing in an encoding
02064         //  string. This way it won't both trying to predecode the
02065         //  first line, looking for an XML/TextDecl.
02066         //
02067         XMLReader* valueReader = fReaderMgr->createIntEntReader
02068         (
02069             decl->getName()
02070             , XMLReader::RefFrom_NonLiteral
02071             , XMLReader::Type_General
02072             , decl->getValue()
02073             , decl->getValueLen()
02074             , false
02075         );
02076 
02077         //
02078         //  Trt to push the entity reader onto the reader manager stack,
02079         //  where it will become the subsequent input. If it fails, that
02080         //  means the entity is recursive, so issue an error. The reader
02081         //  will have just been discarded, but we just keep going.
02082         //
02083         if (!fReaderMgr->pushReader(valueReader, decl))
02084             fScanner->emitError(XMLErrs::RecursiveEntity, decl->getName());
02085     }
02086 
02087     return EntityExp_Pushed;
02088 }
02089 
02090 
02091 //
02092 //  This method will scan a quoted literal of an entity value. It has to
02093 //  deal with replacement of PE references; however, since this is a DTD
02094 //  scanner, all such entity literals are in entity decls and therefore
02095 //  general entities are not expanded.
02096 //
02097 bool DTDScanner::scanEntityLiteral(XMLBuffer& toFill)
02098 {
02099     toFill.reset();
02100 
02101     // Get the next char which must be a single or double quote
02102     XMLCh quoteCh;
02103     if (!fReaderMgr->skipIfQuote(quoteCh))
02104         return false;
02105 
02106     // Get a buffer for pulling in entity names when we see GE refs
02107     XMLBufBid bbName(fBufMgr);
02108     XMLBuffer& nameBuf = bbName.getBuffer();
02109 
02110     // Remember the current reader
02111     const XMLSize_t orgReader = fReaderMgr->getCurrentReaderNum();
02112 
02113     //
02114     //  Loop until we see the ending quote character, handling any references
02115     //  in the process.
02116     //
02117     XMLCh   nextCh;
02118     XMLCh   secondCh = 0;
02119     bool    gotLeadingSurrogate = false;
02120     while (true)
02121     {
02122         nextCh = fReaderMgr->getNextChar();
02123 
02124         //
02125         //  Watch specifically for EOF and issue a more meaningful error
02126         //  if that occurs (since an unterminated quoted char can cause
02127         //  this easily.)
02128         //
02129         if (!nextCh)
02130         {
02131             fScanner->emitError(XMLErrs::UnterminatedEntityLiteral);
02132             ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
02133         }
02134 
02135         //
02136         //  Break out on our terminating quote char when we are back in the
02137         //  same reader. Otherwise, we might trigger on a nested quote char
02138         //  in an expanded entity.
02139         //
02140         if ((nextCh == quoteCh)
02141         &&  (fReaderMgr->getCurrentReaderNum() == orgReader))
02142         {
02143             break;
02144         }
02145 
02146         if (nextCh == chPercent)
02147         {
02148             //
02149             //  Put the PE's value on the reader stack and then jump back
02150             //  to the top to start processing it. The parameter indicates
02151             //  that it should not scan the reference's content as an external
02152             //  subset.
02153             //
02154             expandPERef(false, true, true);
02155             continue;
02156         }
02157 
02158         //
02159         //  Ok, now that all the other special stuff is checked, we can
02160         //  look for a general entity. In here, we cannot have a naked &
02161         //  and will only expand numerical char refs or the intrinsic char
02162         //  refs. Others will be left alone.
02163         //
02164         if (nextCh == chAmpersand)
02165         {
02166             //
02167             //  Here, we only expand numeric char refs, but not any general
02168             //  entities. However, the stupid XML spec requires that we check
02169             //  and make sure it does refer to a general entity if its not
02170             //  a char ref (i.e. no naked '&' chars.)
02171             //
02172             if (fReaderMgr->skippedChar(chPound))
02173             {
02174                 // If it failed, then just jump back to the top and try to pick up
02175                 if (!scanCharRef(nextCh, secondCh))
02176                 {
02177                     gotLeadingSurrogate = false;
02178                     continue;
02179                 }
02180             }
02181              else
02182             {
02183                 if (!fReaderMgr->getName(nameBuf))
02184                 {
02185                     fScanner->emitError(XMLErrs::ExpectedEntityRefName);
02186                 }
02187                  else
02188                 {
02189                     //
02190                     //  Since we are not expanding any of this, we have to
02191                     //  put the amp and name into the target buffer as data.
02192                     //
02193                     toFill.append(chAmpersand);
02194                     toFill.append(nameBuf.getRawBuffer());
02195 
02196                     // Make sure we skipped a trailing semicolon
02197                     if (!fReaderMgr->skippedChar(chSemiColon))
02198                     {
02199                         fScanner->emitError
02200                         (
02201                             XMLErrs::UnterminatedEntityRef
02202                             , nameBuf.getRawBuffer()
02203                         );
02204                     }
02205 
02206                     // And make the new character the semicolon
02207                     nextCh = chSemiColon;
02208                 }
02209 
02210                 // Either way here we reset the surrogate flag
02211                 gotLeadingSurrogate = false;
02212             }
02213         }
02214         else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
02215         {
02216             if (gotLeadingSurrogate)
02217                 fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
02218             else
02219                 gotLeadingSurrogate = true;
02220         }
02221          else
02222         {
02223             if (gotLeadingSurrogate)
02224             {
02225                 if ((nextCh < 0xDC00) || (nextCh > 0xDFFF))
02226                     fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
02227             }
02228              else if (!fReaderMgr->getCurrentReader()->isXMLChar(nextCh))
02229             {
02230                 XMLCh tmpBuf[9];
02231                 XMLString::binToText
02232                 (
02233                     nextCh
02234                     , tmpBuf
02235                     , 8
02236                     , 16
02237                     , fMemoryManager
02238                 );
02239                 fScanner->emitError(XMLErrs::InvalidCharacter, tmpBuf);
02240                 fReaderMgr->skipPastChar(quoteCh);
02241                 return false;
02242             }
02243             gotLeadingSurrogate = false;
02244         }
02245 
02246         // Looks ok, so add it to the literal
02247         toFill.append(nextCh);
02248 
02249         if (secondCh)
02250         {
02251             toFill.append(secondCh);
02252             secondCh=0;
02253         }
02254     }
02255 
02256     //
02257     //  If we got here and did not get back to the original reader level,
02258     //  then we propogated some entity out of the literal, so issue an
02259     //  error, but don't fail.
02260     //
02261     if (fReaderMgr->getCurrentReaderNum() != orgReader && fScanner->getValidationScheme() == XMLScanner::Val_Always)
02262         fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE);
02263 
02264     return true;
02265 }
02266 
02267 
02268 //
02269 //  This method is called after the entity name has been scanned, and any
02270 //  PE referenced following the name is handled. The passed decl will be
02271 //  filled in with the info scanned.
02272 //
02273 bool DTDScanner::scanEntityDef(DTDEntityDecl& decl, const bool isPEDecl)
02274 {
02275     // Its got to be an entity literal
02276     if (fReaderMgr->lookingAtChar(chSingleQuote)
02277     ||  fReaderMgr->lookingAtChar(chDoubleQuote))
02278     {
02279         // Get a buffer for the literal
02280         XMLBufBid bbValue(fBufMgr);
02281 
02282         if (!scanEntityLiteral(bbValue.getBuffer()))
02283             return false;
02284 
02285         // Set it on the entity decl
02286         decl.setValue(bbValue.getRawBuffer());
02287         return true;
02288     }
02289 
02290     //
02291     //  Its got to be an external entity, so there must be an external id.
02292     //  Get buffers for them and scan an external id into them.
02293     //
02294     XMLBufBid bbPubId(fBufMgr);
02295     XMLBufBid bbSysId(fBufMgr);
02296     if (!scanId(bbPubId.getBuffer(), bbSysId.getBuffer(), IDType_External))
02297         return false;
02298 
02299     decl.setIsExternal(true);
02300     ReaderMgr::LastExtEntityInfo lastInfo;
02301     fReaderMgr->getLastExtEntityInfo(lastInfo);
02302 
02303     // Fill in the id fields of the decl with the info we got
02304     const XMLCh* publicId = bbPubId.getRawBuffer();
02305     const XMLCh* systemId = bbSysId.getRawBuffer();
02306     decl.setPublicId((publicId && *publicId) ? publicId : 0);
02307     decl.setSystemId((systemId && *systemId) ? systemId : 0);
02308     decl.setBaseURI((lastInfo.systemId && *lastInfo.systemId) ? lastInfo.systemId : 0);
02309 
02310     // If its a PE decl, we are done
02311     bool gotSpaces = checkForPERef(false, true);
02312     if (isPEDecl)
02313     {
02314         //
02315         //  Check for a common error here. NDATA is not allowed for PEs
02316         //  so check for the NDATA string. If found give a nice meaningful
02317         //  error and continue parsing to eat the NDATA text.
02318         //
02319         if (gotSpaces)
02320         {
02321             if (fReaderMgr->skippedString(XMLUni::fgNDATAString))
02322                 fScanner->emitError(XMLErrs::NDATANotValidForPE);
02323         }
02324          else
02325         {
02326             return true;
02327         }
02328     }
02329 
02330     // If looking at close angle now, we are done
02331     if (fReaderMgr->lookingAtChar(chCloseAngle))
02332         return true;
02333 
02334     // Else we had to have seem the whitespace
02335     if (!gotSpaces)
02336         fScanner->emitError(XMLErrs::ExpectedWhitespace);
02337 
02338     // We now have to see a notation data string
02339     if (!fReaderMgr->skippedString(XMLUni::fgNDATAString))
02340         fScanner->emitError(XMLErrs::ExpectedNDATA);
02341 
02342     // Space is required here, but try to go on if not
02343     if (!checkForPERef(false, true))
02344         fScanner->emitError(XMLErrs::ExpectedWhitespace);
02345 
02346     // Get a name
02347     XMLBufBid bbName(fBufMgr);
02348     if (!fReaderMgr->getName(bbName.getBuffer()))
02349     {
02350         fScanner->emitError(XMLErrs::ExpectedNotationName);
02351         return false;
02352     }
02353 
02354     // Set the decl's notation name
02355     decl.setNotationName(bbName.getRawBuffer());
02356 
02357     return true;
02358 }
02359 
02360 
02361 //
02362 //  This method is called after an attribute decl name or a notation decl has
02363 //  been scanned and then an opening parenthesis was see, indicating the list
02364 //  of values. It scans the enumeration values and creates a single string
02365 //  which has a single space between each value.
02366 //
02367 //  The terminating close paren ends this scan.
02368 //
02369 bool DTDScanner::scanEnumeration( const   DTDAttDef&  attDef
02370                                     ,       XMLBuffer&  toFill
02371                                     , const bool        notation)
02372 {
02373     // Reset the passed buffer
02374     toFill.reset();
02375 
02376     // Check for PE ref but don't require space
02377     checkForPERef(false, true);
02378 
02379     // If this is a notation, we need an opening paren
02380     if (notation)
02381     {
02382         if (!fReaderMgr->skippedChar(chOpenParen))
02383             fScanner->emitError(XMLErrs::ExpectedOpenParen);
02384     }
02385 
02386     // We need a local buffer to use as well
02387     XMLBufBid bbTmp(fBufMgr);
02388 
02389     while (true)
02390     {
02391         // Space is allowed here for either type so check for PE ref
02392         checkForPERef(false, true);
02393 
02394         // And then get either a name or a name token
02395         bool success;
02396         if (notation)
02397             success = fReaderMgr->getName(bbTmp.getBuffer());
02398         else
02399             success = fReaderMgr->getNameToken(bbTmp.getBuffer());
02400 
02401         if (!success)
02402         {
02403             fScanner->emitError
02404             (
02405                 XMLErrs::ExpectedEnumValue
02406                 , attDef.getFullName()
02407             );
02408             return false;
02409         }
02410 
02411         // Append this value to the target value
02412         toFill.append(bbTmp.getRawBuffer(), bbTmp.getLen());
02413 
02414         // Space is allowed here for either type so check for PE ref
02415         checkForPERef(false, true);
02416 
02417         // Check for the terminating paren
02418         if (fReaderMgr->skippedChar(chCloseParen))
02419             break;
02420 
02421         // And append a space separator
02422         toFill.append(chSpace);
02423 
02424         // Check for the pipe character separator
02425         if (!fReaderMgr->skippedChar(chPipe))
02426         {
02427             fScanner->emitError(XMLErrs::ExpectedEnumSepOrParen);
02428             return false;
02429         }
02430     }
02431     return true;
02432 }
02433 
02434 
02435 bool DTDScanner::scanEq()
02436 {
02437     fReaderMgr->skipPastSpaces();
02438     if (fReaderMgr->skippedChar(chEqual))
02439     {
02440         fReaderMgr->skipPastSpaces();
02441         return true;
02442     }
02443     return false;
02444 }
02445 
02446 
02447 //
02448 //  This method is called when an external entity reference is seen in the
02449 //  DTD or an external DTD subset is encountered, and their contents pushed
02450 //  onto the reader stack. This method will scan that contents.
02451 //
02452 void DTDScanner::scanExtSubsetDecl(const bool inIncludeSect, const bool isDTD)
02453 {
02454     // Indicate we are in the external subset now
02455     FlagJanitor<bool> janContentFlag(&fInternalSubset, false);
02456 
02457 
02458     bool bAcceptDecl = !inIncludeSect;
02459 
02460     // Get a buffer for whitespace
02461     XMLBufBid bbSpace(fBufMgr);
02462 
02463     //
02464     //  If we have a doc type handler and we are not being called recursively
02465     //  to handle an include section, tell it the ext subset starts
02466     //
02467     if (fDocTypeHandler && isDTD && !inIncludeSect)
02468         fDocTypeHandler->startExtSubset();
02469 
02470     //
02471     //  We have to play a trick here if the current entity we are parsing
02472     //  is a PE. Because the spooling code will put out a whitespace before
02473     //  and after an expanded PE if its being scanned outside the context of
02474     //  a literal entity, this will confuse this external subset code.
02475     //
02476     //  So, we see if that is what is happening and, if so, eat the single
02477     //  space, a check for the <?xml string. If we find it, we parse that
02478     //  markup right now and put the space back.
02479     //
02480     if (fReaderMgr->isScanningPERefOutOfLiteral())
02481     {
02482         if (fReaderMgr->skippedSpace())
02483         {
02484             if (fScanner->checkXMLDecl(true))
02485             {
02486                 scanTextDecl();
02487                 bAcceptDecl = false;
02488 
02489                 // <TBD> Figure out how to do this
02490                 // fReaderMgr->unGet(chSpace);
02491             }
02492         }
02493     }
02494 
02495     // Get the current reader number
02496     const XMLSize_t orgReader = fReaderMgr->getCurrentReaderNum();
02497 
02498     //
02499     //  Loop until we hit the end of the external subset entity. Note that
02500     //  we use a double loop here in order to avoid the overhead of doing
02501     //  the exception setup/teardown work on every loop.
02502     //
02503     bool inMarkup = false;
02504     bool inCharData = false;
02505     while (true)
02506     {
02507         bool bDoBreak=false;    // workaround for Borland bug with 'break' in 'catch'
02508         try
02509         {
02510             while (true)
02511             {
02512                 const XMLCh nextCh = fReaderMgr->peekNextChar();
02513 
02514                 if (!nextCh)
02515                 {
02516                     return; // nothing left
02517                 }
02518                 else if (nextCh == chOpenAngle)
02519                 {
02520                     // Get the reader we started this on
02521                     // XML 1.0 P28a Well-formedness constraint: PE Between Declarations
02522                     const XMLSize_t orgReader = fReaderMgr->getCurrentReaderNum();
02523                     bool wasInPE = (fReaderMgr->getCurrentReader()->getType() == XMLReader::Type_PE);
02524 
02525                     //
02526                     //  Now scan the markup. Set the flag so that we will know that
02527                     //  we were in markup if an end of entity exception occurs.
02528                     //
02529                     fReaderMgr->getNextChar();
02530                     inMarkup = true;
02531                     scanMarkupDecl(bAcceptDecl);
02532                     inMarkup = false;
02533 
02534                     //
02535                     //  And see if we got back to the same level. If not, then its
02536                     //  a partial markup error.
02537                     //
02538                     if (fReaderMgr->getCurrentReaderNum() != orgReader){
02539                         if (wasInPE)
02540                             fScanner->emitError(XMLErrs::PEBetweenDecl);
02541                         else if (fScanner->getValidationScheme() == XMLScanner::Val_Always)
02542                             fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE);
02543                     }
02544 
02545                 }
02546                 else if (fReaderMgr->getCurrentReader()->isWhitespace(nextCh))
02547                 {
02548                     //
02549                     //  If we have a doc type handler, and advanced callbacks are
02550                     //  enabled, then gather up whitespace and call back. Otherwise
02551                     //  just skip whitespaces.
02552                     //
02553                     if (fDocTypeHandler)
02554                     {
02555                         inCharData = true;
02556                         fReaderMgr->getSpaces(bbSpace.getBuffer());
02557                         inCharData = false;
02558 
02559                         fDocTypeHandler->doctypeWhitespace
02560                         (
02561                             bbSpace.getRawBuffer()
02562                             , bbSpace.getLen()
02563                         );
02564                     }
02565                     else
02566                     {
02567                         //
02568                         //  If we hit an end of entity in the middle of white
02569                         //  space, that's fine. We'll just come back in here
02570                         //  again on the next round and skip some more.
02571                         //
02572                         fReaderMgr->skipPastSpaces();
02573                     }
02574                 }
02575                 else if (nextCh == chPercent)
02576                 {
02577                     //
02578                     //  Expand (and scan if external) the reference value. Tell
02579                     //  it to throw an end of entity exception at the end of the
02580                     //  entity.
02581                     //
02582                     fReaderMgr->getNextChar();
02583                     expandPERef(true, false, false, true);
02584                 }
02585                 else if (inIncludeSect && (nextCh == chCloseSquare))
02586                 {
02587                     //
02588                     //  Its the end of a conditional include section. So scan it and
02589                     //  decrement the include depth counter.
02590                     //
02591                     fReaderMgr->getNextChar();
02592                     if (!fReaderMgr->skippedChar(chCloseSquare))
02593                     {
02594                         fScanner->emitError(XMLErrs::ExpectedEndOfConditional);
02595                         fReaderMgr->skipPastChar(chCloseAngle);
02596                     }
02597                     else if (!fReaderMgr->skippedChar(chCloseAngle))
02598                     {
02599                         fScanner->emitError(XMLErrs::ExpectedEndOfConditional);
02600                         fReaderMgr->skipPastChar(chCloseAngle);
02601                     }
02602                     return;
02603                 }
02604                 else
02605                 {
02606                     fReaderMgr->getNextChar();
02607                     if (!fReaderMgr->getCurrentReader()->isXMLChar(nextCh))
02608                     {
02609                         XMLCh tmpBuf[9];
02610                         XMLString::binToText
02611                         (
02612                             nextCh
02613                             , tmpBuf
02614                             , 8
02615                             , 16
02616                             , fMemoryManager
02617                         );
02618                         fScanner->emitError(XMLErrs::InvalidCharacter, tmpBuf);
02619                     }
02620                     else
02621                     {
02622                         fScanner->emitError(XMLErrs::InvalidDocumentStructure);
02623                     }
02624 
02625                     // Try to get realigned
02626                     static const XMLCh toSkip[] =
02627                     {
02628                         chPercent, chCloseSquare, chOpenAngle, chNull
02629                     };
02630                     fReaderMgr->skipUntilInOrWS(toSkip);
02631                 }
02632                 bAcceptDecl = false;
02633             }
02634         }
02635         catch(const EndOfEntityException& toCatch)
02636         {
02637             //
02638             //  If the external entity ended while we were in markup, then that's
02639             //  a partial markup error.
02640             //
02641             if (inMarkup)
02642             {
02643                 fScanner->emitError(XMLErrs::PartialMarkupInEntity);
02644                 inMarkup = false;
02645             }
02646 
02647             // If we were in char data, then send what we got
02648             if (inCharData)
02649             {
02650                 // Send what we got, then rethrow
02651                 if (fDocTypeHandler)
02652                 {
02653                     fDocTypeHandler->doctypeWhitespace
02654                     (
02655                         bbSpace.getRawBuffer()
02656                         , bbSpace.getLen()
02657                     );
02658                 }
02659                 inCharData = false;
02660             }
02661 
02662             //
02663             //  If the entity that just ended was the entity that we started
02664             //  on, then this is the end of the external subset.
02665             //
02666             if (orgReader == toCatch.getReaderNum())
02667                 bDoBreak=true;
02668         }
02669         if(bDoBreak)
02670             break;
02671     }
02672 
02673     // If we have a doc type handler, tell it the ext subset ends
02674     if (fDocTypeHandler && isDTD && !inIncludeSect)
02675         fDocTypeHandler->endExtSubset();
02676 }
02677 
02678 
02679 //
02680 //  This method will scan for an id, either public or external.
02681 //
02682 //
02683 // [75] ExternalID ::= 'SYSTEM' S SystemLiteral
02684 //                     | 'PUBLIC' S PubidLiteral S SystemLiteral
02685 // [83] PublicID ::= 'PUBLIC' S PubidLiteral
02686 //
02687 bool DTDScanner::scanId(          XMLBuffer&  pubIdToFill
02688                             ,       XMLBuffer&  sysIdToFill
02689                             , const IDTypes     whatKind)
02690 {
02691     // Clean out both return buffers
02692     pubIdToFill.reset();
02693     sysIdToFill.reset();
02694 
02695     //
02696     //  Check first for the system id first. If we find it, and system id
02697     //  is one of the legal values, then lets try to scan it.
02698     //
02699     // 'SYSTEM' S SystemLiteral
02700     if (fReaderMgr->skippedString(XMLUni::fgSysIDString))
02701     {
02702         // If they were looking for a public id, then we failed
02703         if (whatKind == IDType_Public)
02704         {
02705             fScanner->emitError(XMLErrs::ExpectedPublicId);
02706             return false;
02707         }
02708 
02709         // We must skip spaces
02710         bool skippedSomething;
02711         fReaderMgr->skipPastSpaces(skippedSomething);
02712         if (!skippedSomething)
02713         {
02714             fScanner->emitError(XMLErrs::ExpectedWhitespace);
02715             return false;
02716         }
02717 
02718         // Get the system literal value
02719         return scanSystemLiteral(sysIdToFill);
02720     }
02721 
02722     // Now scan for public id
02723     // 'PUBLIC' S PubidLiteral S SystemLiteral
02724     //  or
02725     // 'PUBLIC' S PubidLiteral
02726 
02727     // If we don't have any public id string => Error
02728     if (!fReaderMgr->skippedString(XMLUni::fgPubIDString)) {
02729         fScanner->emitError(XMLErrs::ExpectedSystemOrPublicId);
02730         return false;
02731     }
02732 
02733     //
02734     //  So following this we must have whitespace, a public literal, whitespace,
02735     //  and a system literal.
02736     //
02737     bool skippedSomething;
02738     fReaderMgr->skipPastSpaces(skippedSomething);
02739     if (!skippedSomething)
02740     {
02741         fScanner->emitError(XMLErrs::ExpectedWhitespace);
02742 
02743         //
02744         //  Just in case, if they just forgot the whitespace but the next char
02745         //  is a single or double quote, then keep going.
02746         //
02747         const XMLCh chPeek = fReaderMgr->peekNextChar();
02748         if ((chPeek != chDoubleQuote) && (chPeek != chSingleQuote))
02749             return false;
02750     }
02751 
02752     if (!scanPublicLiteral(pubIdToFill))
02753         return false;
02754 
02755     // If they wanted a public id, then this is all
02756     if (whatKind == IDType_Public)
02757         return true;
02758 
02759     // check if there is any space follows
02760     bool hasSpace;
02761     fReaderMgr->skipPastSpaces(hasSpace);
02762 
02763     //
02764     //  In order to recover best here we need to see if
02765     //  the next thing is a quote or not
02766     //
02767     const XMLCh chPeek = fReaderMgr->peekNextChar();
02768     const bool bIsQuote =  ((chPeek == chDoubleQuote)
02769                          || (chPeek == chSingleQuote));
02770 
02771     if (!hasSpace)
02772     {
02773         if (whatKind == IDType_External)
02774         {
02775             //
02776             //  If its an external Id, then we need to see the system id.
02777             //  So, emit the error. But, if the next char is a quote, don't
02778             //  give up since its probably going to work. The user just
02779             //  missed the separating space. Otherwise, fail.
02780             //
02781             fScanner->emitError(XMLErrs::ExpectedWhitespace);
02782             if (!bIsQuote)
02783                 return false;
02784         }
02785          else
02786         {
02787             //
02788             //  We can legally return here. But, if the next char is a quote,
02789             //  then that's probably not what was desired, since its probably
02790             //  just that space was forgotten and there really is a system
02791             //  id to follow.
02792             //
02793             //  So treat it like missing whitespace if so and keep going.
02794             //  Else, just return success.
02795             //
02796             if (bIsQuote)
02797                 fScanner->emitError(XMLErrs::ExpectedWhitespace);
02798              else
02799                 return true;
02800         }
02801     }
02802 
02803     if (bIsQuote) {
02804         // there is a quote coming, scan the system literal
02805         if (!scanSystemLiteral(sysIdToFill))
02806             return false;
02807     }
02808     else {
02809         // no quote, if expecting exteral id, this is an error
02810         if (whatKind == IDType_External)
02811             fScanner->emitError(XMLErrs::ExpectedQuotedString);
02812     }
02813 
02814     return true;
02815 }
02816 
02817 
02818 //
02819 //  This method will scan the contents of an ignored section. It assumes that
02820 //  we already are in the body, i.e. we've seen <![IGNORE[ at this point. So
02821 //  we have to just scan until we see a matching ]]> closing markup.
02822 //
02823 void DTDScanner::scanIgnoredSection()
02824 {
02825     //
02826     //  Depth starts at one because we are already in one section and want
02827     //  to parse until we hit its end.
02828     //
02829     unsigned long depth = 1;
02830     bool gotLeadingSurrogate = false;
02831     while (true)
02832     {
02833         const XMLCh nextCh = fReaderMgr->getNextChar();
02834 
02835         if (!nextCh)
02836             ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
02837 
02838         if (nextCh == chOpenAngle)
02839         {
02840             if (fReaderMgr->skippedChar(chBang)
02841             &&  fReaderMgr->skippedChar(chOpenSquare))
02842             {
02843                 depth++;
02844             }
02845         }
02846          else if (nextCh == chCloseSquare)
02847         {
02848             if (fReaderMgr->skippedChar(chCloseSquare))
02849             {
02850                 while (fReaderMgr->skippedChar(chCloseSquare))
02851                 {
02852                     // Do nothing, just skip them
02853                 }
02854 
02855                 if (fReaderMgr->skippedChar(chCloseAngle))
02856                 {
02857                     depth--;
02858                     if (!depth)
02859                         break;
02860                 }
02861             }
02862         }
02863         // Deal with surrogate pairs
02864         else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
02865         {
02866             //  Its a leading surrogate. If we already got one, then
02867             //  issue an error, else set leading flag to make sure that
02868             //  we look for a trailing next time.
02869             if (gotLeadingSurrogate)
02870                 fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
02871             else
02872                 gotLeadingSurrogate = true;
02873         }
02874         else
02875         {
02876             //  If its a trailing surrogate, make sure that we are
02877             //  prepared for that. Else, its just a regular char so make
02878             //  sure that we were not expected a trailing surrogate.
02879             if ((nextCh >= 0xDC00) && (nextCh <= 0xDFFF))
02880             {
02881                 // Its trailing, so make sure we were expecting it
02882                 if (!gotLeadingSurrogate)
02883                     fScanner->emitError(XMLErrs::Unexpected2ndSurrogateChar);
02884             }
02885             else
02886             {
02887                 //  Its just a char, so make sure we were not expecting a
02888                 //  trailing surrogate.
02889                 if (gotLeadingSurrogate)
02890                     fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
02891 
02892                 // Its got to at least be a valid XML character
02893                 else if (!fReaderMgr->getCurrentReader()->isXMLChar(nextCh))
02894                 {
02895                     XMLCh tmpBuf[9];
02896                     XMLString::binToText
02897                     (
02898                         nextCh
02899                         , tmpBuf
02900                         , 8
02901                         , 16
02902                         , fMemoryManager
02903                     );
02904                     fScanner->emitError(XMLErrs::InvalidCharacter, tmpBuf);
02905                 }
02906             }
02907             gotLeadingSurrogate = false;
02908         }
02909     }
02910 }
02911 
02912 
02913 //
02914 //  This method scans the entire internal subset. All we can have here is
02915 //  decl markup, and PE references. The expanded PE references must contain
02916 //  whole markup, so we don't have to worry about their content at this
02917 //  level. We just scan them, expand them, push them, and parse their content
02918 //  right there, via the expandERef() method.
02919 //
02920 bool DTDScanner::scanInternalSubset()
02921 {
02922     // Indicate we are in the internal subset now
02923     FlagJanitor<bool> janContentFlag(&fInternalSubset, true);
02924 
02925     // If we have a doc type handler, tell it the internal subset starts
02926     if (fDocTypeHandler)
02927         fDocTypeHandler->startIntSubset();
02928 
02929     // Get a buffer for whitespace
02930     XMLBufBid bbSpace(fBufMgr);
02931 
02932     bool noErrors = true;
02933     while (true)
02934     {
02935         const XMLCh nextCh = fReaderMgr->peekNextChar();
02936 
02937         //
02938         //  If we get an end of file marker, just unget it and return a
02939         //  failure status. The caller will then see the end of file and
02940         //  faill out correctly.
02941         //
02942         if (!nextCh)
02943             return false;
02944 
02945         // Watch for the end of internal subset marker
02946         if (nextCh == chCloseSquare)
02947         {
02948             fReaderMgr->getNextChar();
02949             break;
02950         }
02951 
02952         if (nextCh == chPercent)
02953         {
02954             //
02955             //  Expand (and scan if external) the reference value. Tell
02956             //  it to set the reader to cause an end of entity exception
02957             //  when this reader dies, which is what the scanExtSubset
02958             //  method wants (who is called to scan this.)
02959             //
02960             fReaderMgr->getNextChar();
02961             expandPERef(true, false, false, true);
02962         }
02963          else if (nextCh == chOpenAngle)
02964         {
02965             // Remember this reader before we start the scan, for checking
02966             // XML 1.0 P28a Well-formedness constraint: PE Between Declarations
02967             const XMLSize_t orgReader = fReaderMgr->getCurrentReaderNum();
02968             bool wasInPE = (fReaderMgr->getCurrentReader()->getType() == XMLReader::Type_PE);
02969 
02970             // And scan this markup
02971             fReaderMgr->getNextChar();
02972             scanMarkupDecl(false);
02973 
02974             // If we did not get back to entry level, then partial markup
02975             if (fReaderMgr->getCurrentReaderNum() != orgReader) {
02976                 if (wasInPE)
02977                     fScanner->emitError(XMLErrs::PEBetweenDecl);
02978                 else if (fScanner->getValidationScheme() == XMLScanner::Val_Always)
02979                     fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE);
02980             }
02981         }
02982          else if (fReaderMgr->getCurrentReader()->isWhitespace(nextCh))
02983         {
02984             //
02985             //  IF we are doing advanced callbacks and have a doc type
02986             //  handler, then get the whitespace and call the doc type
02987             //  handler with it. Otherwise, just skip whitespace.
02988             //
02989             if (fDocTypeHandler)
02990             {
02991                 fReaderMgr->getSpaces(bbSpace.getBuffer());
02992                 fDocTypeHandler->doctypeWhitespace
02993                 (
02994                     bbSpace.getRawBuffer()
02995                     , bbSpace.getLen()
02996                 );
02997             }
02998              else
02999             {
03000                 fReaderMgr->skipPastSpaces();
03001             }
03002         }
03003          else
03004         {
03005             // Not valid, so emit an error
03006             XMLCh tmpBuf[9];
03007             XMLString::binToText
03008             (
03009                 fReaderMgr->getNextChar()
03010                 , tmpBuf
03011                 , 8
03012                 , 16
03013                 , fMemoryManager
03014             );
03015             fScanner->emitError
03016             (
03017                 XMLErrs::InvalidCharacterInIntSubset
03018                 , tmpBuf
03019             );
03020 
03021             //
03022             //  If an '>', then probably an abnormally terminated
03023             //  internal subset so just return.
03024             //
03025             if (nextCh == chCloseAngle)
03026             {
03027                 noErrors = false;
03028                 break;
03029             }
03030 
03031             //
03032             //  Otherwise, try to sync back up by scanning forward for
03033             //  a reasonable start character.
03034             //
03035             static const XMLCh toSkip[] =
03036             {
03037                 chPercent, chCloseSquare, chOpenAngle, chNull
03038             };
03039             fReaderMgr->skipUntilInOrWS(toSkip);
03040         }
03041     }
03042 
03043     // If we have a doc type handler, tell it the internal subset ends
03044     if (fDocTypeHandler)
03045         fDocTypeHandler->endIntSubset();
03046 
03047     return noErrors;
03048 }
03049 
03050 
03051 //
03052 //  This method is called once we see a < in the input of an int/ext subset,
03053 //  which indicates the start of some sort of markup.
03054 //
03055 void DTDScanner::scanMarkupDecl(const bool parseTextDecl)
03056 {
03057     //
03058     //  We only have two valid first characters here. One is a ! which opens
03059     //  some markup decl. The other is a ?, which could begin either a PI
03060     //  or a text decl. If parseTextDecl is false, we cannot accept a text
03061     //  decl.
03062     //
03063     const XMLCh nextCh = fReaderMgr->getNextChar();
03064 
03065     if (nextCh == chBang)
03066     {
03067         if (fReaderMgr->skippedChar(chDash))
03068         {
03069             if (fReaderMgr->skippedChar(chDash))
03070             {
03071                 scanComment();
03072             }
03073              else
03074             {
03075                 fScanner->emitError(XMLErrs::CommentsMustStartWith);
03076                 fReaderMgr->skipPastChar(chCloseAngle);
03077             }
03078         }
03079          else if (fReaderMgr->skippedChar(chOpenSquare))
03080         {
03081             //
03082             //  Its a conditional section. This is only valid in the external
03083             //  subset, so issue an error if we aren't there.
03084             //
03085             if (fInternalSubset)
03086             {
03087                 fScanner->emitError(XMLErrs::ConditionalSectInIntSubset);
03088                 fReaderMgr->skipPastChar(chCloseAngle);
03089                 return;
03090             }
03091 
03092             // A PE ref can happen here, but space is not required
03093             checkForPERef(false, true);
03094 
03095             if (fReaderMgr->skippedString(XMLUni::fgIncludeString))
03096             {
03097                 checkForPERef(false, true);
03098 
03099                 // Check for the following open square bracket
03100                 if (!fReaderMgr->skippedChar(chOpenSquare))
03101                     fScanner->emitError(XMLErrs::ExpectedINCLUDEBracket);
03102 
03103                 // Get the reader we started this on
03104                 const XMLSize_t orgReader = fReaderMgr->getCurrentReaderNum();
03105 
03106                 checkForPERef(false, true);
03107 
03108                 //
03109                 //  Recurse back to the ext subset call again, telling it its
03110                 //  in an include section.
03111                 //
03112                 scanExtSubsetDecl(true, false);
03113 
03114                 //
03115                 //  And see if we got back to the same level. If not, then its
03116                 //  a partial markup error.
03117                 //
03118                 if (fReaderMgr->getCurrentReaderNum() != orgReader && fScanner->getValidationScheme() == XMLScanner::Val_Always)
03119                     fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE);
03120 
03121             }
03122              else if (fReaderMgr->skippedString(XMLUni::fgIgnoreString))
03123             {
03124                 checkForPERef(false, true);
03125 
03126                 // Check for the following open square bracket
03127                 if (!fReaderMgr->skippedChar(chOpenSquare))
03128                     fScanner->emitError(XMLErrs::ExpectedINCLUDEBracket);
03129 
03130                 // Get the reader we started this on
03131                 const XMLSize_t orgReader = fReaderMgr->getCurrentReaderNum();
03132 
03133                 // And scan over the ignored part
03134                 scanIgnoredSection();
03135 
03136                 //
03137                 //  And see if we got back to the same level. If not, then its
03138                 //  a partial markup error.
03139                 //
03140                 if (fReaderMgr->getCurrentReaderNum() != orgReader && fScanner->getValidationScheme() == XMLScanner::Val_Always)
03141                     fScanner->getValidator()->emitError(XMLValid::PartialMarkupInPE);
03142 
03143             }
03144              else
03145             {
03146                 fScanner->emitError(XMLErrs::ExpectedIncOrIgn);
03147                 fReaderMgr->skipPastChar(chCloseAngle);
03148             }
03149         }
03150          else if (fReaderMgr->skippedString(XMLUni::fgAttListString))
03151         {
03152             scanAttListDecl();
03153         }
03154          else if (fReaderMgr->skippedString(XMLUni::fgElemString))
03155         {
03156             scanElementDecl();
03157         }
03158          else if (fReaderMgr->skippedString(XMLUni::fgEntityString))
03159         {
03160             scanEntityDecl();
03161         }
03162          else if (fReaderMgr->skippedString(XMLUni::fgNotationString))
03163         {
03164             scanNotationDecl();
03165         }
03166          else
03167         {
03168             fScanner->emitError(XMLErrs::ExpectedMarkupDecl);
03169             fReaderMgr->skipPastChar(chCloseAngle);
03170         }
03171     }
03172      else if (nextCh == chQuestion)
03173     {
03174         // It could be a PI or the XML declaration. Check for Decl
03175         if (fScanner->checkXMLDecl(false))
03176         {
03177             // If we are not accepting text decls, its an error
03178             if (parseTextDecl)
03179             {
03180                 scanTextDecl();
03181             }
03182              else
03183             {
03184                 // Emit the error and skip past this markup
03185                 fScanner->emitError(XMLErrs::TextDeclNotLegalHere);
03186                 fReaderMgr->skipPastChar(chCloseAngle);
03187             }
03188         }
03189          else
03190         {
03191             // It has to be a PI
03192             scanPI();
03193         }
03194     }
03195      else
03196     {
03197         // Can't be valid so emit error and try to skip past end of this decl
03198         fScanner->emitError(XMLErrs::ExpectedMarkupDecl);
03199         fReaderMgr->skipPastChar(chCloseAngle);
03200     }
03201 }
03202 
03203 
03204 //
03205 //  This method is called for a mixed model element's content mode. We've
03206 //  already scanned past the '(PCDATA' part by the time we get here. So
03207 //  everything else is element names separated by | characters until we
03208 //  hit the end. The passed element decl's content model is filled in with
03209 //  the information found.
03210 //
03211 bool DTDScanner::scanMixed(DTDElementDecl& toFill)
03212 {
03213     //
03214     //  The terminating star is only required if there is something more
03215     //  than (PCDATA).
03216     //
03217     bool starRequired = false;
03218 
03219     // Get a buffer to be used below to get element names
03220     XMLBufBid bbName(fBufMgr);
03221     XMLBuffer& nameBuf = bbName.getBuffer();
03222 
03223     //
03224     //  Create an initial content spec node. Its just a leaf node with a
03225     //  PCDATA element id. This current node pointer will be pushed down the
03226     //  tree as we go.
03227     //
03228     ContentSpecNode* curNode = new (fGrammarPoolMemoryManager) ContentSpecNode
03229     (
03230         new (fGrammarPoolMemoryManager) QName
03231         (
03232             XMLUni::fgZeroLenString
03233             , XMLUni::fgZeroLenString
03234             , XMLElementDecl::fgPCDataElemId
03235             , fGrammarPoolMemoryManager
03236         )
03237         , false
03238         , fGrammarPoolMemoryManager
03239     );
03240 
03241     //
03242     //  Set the initial leaf as the temporary head. If we hit the first choice
03243     //  node, it will be set up here. When done, this is the node that's set
03244     //  as the content spec for the element.
03245     //
03246     ContentSpecNode* headNode = curNode;
03247 
03248     // Remember the original node so we can sense the first choice node
03249     ContentSpecNode* orgNode = curNode;
03250 
03251     //
03252     //  We just loop around, getting the | character at the top and then
03253     //  looking for the next element name. We keep up with the last node
03254     //  and add each new one to its right node.
03255     //
03256     while (true)
03257     {
03258         //
03259         //  First of all we check for some grunt work details of skipping
03260         //  whitespace, expand PE refs, and catching invalid reps.
03261         //
03262         if (fReaderMgr->lookingAtChar(chPercent))
03263         {
03264             // Expand it and continue
03265             checkForPERef(false, true);
03266         }
03267          else if (fReaderMgr->skippedChar(chAsterisk))
03268         {
03269             //
03270             //  Tell them they can't have reps in mixed model, but eat
03271             //  it and keep going if we are allowed to.
03272             //
03273             if (fScanner->emitErrorWillThrowException(XMLErrs::NoRepInMixed))
03274             {
03275                 delete headNode;
03276             }
03277             fScanner->emitError(XMLErrs::NoRepInMixed);
03278         }
03279          else if (fReaderMgr->skippedSpace())
03280         {
03281             // Spaces are ok at this point, just eat them and continue
03282             fReaderMgr->skipPastSpaces();
03283         }
03284          else
03285         {
03286             if (!fReaderMgr->skippedChar(chPipe))
03287             {
03288                 // Has to be the closing paren now.
03289                 if (!fReaderMgr->skippedChar(chCloseParen))
03290                 {
03291                     delete headNode;
03292                     fScanner->emitError(XMLErrs::UnterminatedContentModel, toFill.getElementName()->getLocalPart());                     
03293                     return false;
03294                 }
03295 
03296                 bool starSkipped = true;
03297                 if (!fReaderMgr->skippedChar(chAsterisk)) {
03298 
03299                     starSkipped = false;
03300 
03301                     if (starRequired)
03302                     {
03303                         if (fScanner->emitErrorWillThrowException(XMLErrs::ExpectedAsterisk))
03304                         {
03305                             delete headNode;
03306                         }
03307                         fScanner->emitError(XMLErrs::ExpectedAsterisk);
03308                     }
03309                 }
03310 
03311                 //
03312                 //  Create a zero or more node and make the original head
03313                 //  node its first child.
03314                 //
03315                 if (starRequired || starSkipped) {
03316                     headNode = new (fGrammarPoolMemoryManager) ContentSpecNode
03317                     (
03318                         ContentSpecNode::ZeroOrMore
03319                         , headNode
03320                         , 0
03321                         , true
03322                         , true
03323                         , fGrammarPoolMemoryManager
03324                     );
03325                 }
03326 
03327                 // Store the head node as the content spec of the element.
03328                 toFill.setContentSpec(headNode);
03329                 break;
03330             }
03331 
03332             // Its more than just a PCDATA, so an ending star will be required now
03333             starRequired = true;
03334 
03335             // Space is legal here so check for a PE ref, but don't require space
03336             checkForPERef(false, true);
03337 
03338             // Get a name token
03339             if (!fReaderMgr->getName(nameBuf))
03340             {
03341                 delete headNode;
03342                 fScanner->emitError(XMLErrs::ExpectedElementName);
03343                 return false;
03344             }
03345 
03346             //
03347             //  Create a leaf node for it. If we can find the element id for
03348             //  this element, then use it. Else, we have to fault in an element
03349             //  decl, marked as created because of being in a content model.
03350             //
03351             XMLElementDecl* decl = fDTDGrammar->getElemDecl(fEmptyNamespaceId, 0, nameBuf.getRawBuffer(), Grammar::TOP_LEVEL_SCOPE);
03352             if (!decl)
03353             {
03354                 decl = new (fGrammarPoolMemoryManager) DTDElementDecl
03355                 (
03356                     nameBuf.getRawBuffer()
03357                     , fEmptyNamespaceId
03358                     , DTDElementDecl::Any
03359                     , fGrammarPoolMemoryManager
03360                 );
03361                 decl->setCreateReason(XMLElementDecl::InContentModel);
03362                 decl->setExternalElemDeclaration(isReadingExternalEntity());
03363                 fDTDGrammar->putElemDecl(decl);
03364             }
03365 
03366             //
03367             //  If the current node is the original node, this is the first choice
03368             //  node, so create an initial choice node with the current node and
03369             //  the new element id. Store this as the head node.
03370             //
03371             //  Otherwise, we have to steal the right node of the previous choice
03372             //  and weave in another choice node there, which has the old choice
03373             //  as its left and the new leaf as its right.
03374             //
03375             if (curNode == orgNode)
03376             {
03377                 curNode = new (fGrammarPoolMemoryManager) ContentSpecNode
03378                 (
03379                     ContentSpecNode::Choice
03380                     , curNode
03381                     , new (fGrammarPoolMemoryManager) ContentSpecNode
03382                       (
03383                           decl->getElementName()
03384                           , fGrammarPoolMemoryManager
03385                       )
03386                     , true
03387                     , true
03388                     , fGrammarPoolMemoryManager
03389                 );
03390 
03391                 // Remember the top node
03392                 headNode = curNode;
03393             }
03394              else
03395             {
03396                 ContentSpecNode* oldRight = curNode->orphanSecond();
03397                 curNode->setSecond
03398                 (
03399                     new (fGrammarPoolMemoryManager) ContentSpecNode
03400                     (
03401                         ContentSpecNode::Choice
03402                         , oldRight
03403                         , new (fGrammarPoolMemoryManager) ContentSpecNode
03404                           (
03405                               decl->getElementName()
03406                               , fGrammarPoolMemoryManager
03407                           )
03408                         , true
03409                         , true
03410                         , fGrammarPoolMemoryManager
03411                     )
03412                 );
03413 
03414                 // Make the new right node the current node
03415                 curNode = curNode->getSecond();
03416             }
03417         }
03418     }
03419 
03420     return true;
03421 }
03422 
03423 
03424 //
03425 //  This method is called when we see a '<!NOTATION' string while scanning
03426 //  markup decl. It parses out the notation and its id and stores a new
03427 //  notation decl object in the notation decl pool.
03428 //
03429 void DTDScanner::scanNotationDecl()
03430 {
03431     // Space is required here so check for a PE ref, and require space
03432     if (!checkForPERef(false, true))
03433     {
03434         fScanner->emitError(XMLErrs::ExpectedWhitespace);
03435         fReaderMgr->skipPastChar(chCloseAngle);
03436         return;
03437     }
03438 
03439     //
03440     //  And now we get a name, which is the name of the notation. Get a
03441     //  buffer for the name.
03442     //
03443     XMLBufBid bbName(fBufMgr);
03444     if (!fReaderMgr->getName(bbName.getBuffer()))
03445     {
03446         fScanner->emitError(XMLErrs::ExpectedNotationName);
03447         fReaderMgr->skipPastChar(chCloseAngle);
03448         return;
03449     }
03450 
03451     // If namespaces are enabled, then no colons allowed
03452     if (fScanner->getDoNamespaces())
03453     {
03454         if (XMLString::indexOf(bbName.getRawBuffer(), chColon) != -1)
03455             fScanner->emitError(XMLErrs::ColonNotLegalWithNS);
03456     }
03457 
03458     // Space is required here so check for a PE ref, and require space
03459     if (!checkForPERef(false, true))
03460     {
03461         fScanner->emitError(XMLErrs::ExpectedWhitespace);
03462         fReaderMgr->skipPastChar(chCloseAngle);
03463         return;
03464     }
03465 
03466     //
03467     //  And scan an external or public id. We need buffers to use for both
03468     //  of these.
03469     //
03470     XMLBufBid bbPubId(fBufMgr);
03471     XMLBufBid bbSysId(fBufMgr);
03472     if (!scanId(bbPubId.getBuffer(), bbSysId.getBuffer(), IDType_Either))
03473     {
03474         fReaderMgr->skipPastChar(chCloseAngle);
03475         return;
03476     }
03477 
03478     // We can have an optional space or PE ref here
03479     checkForPERef(false, true);
03480 
03481     //
03482     //  See if it already exists. If so, add it to the notatino decl pool.
03483     //  Otherwise, if advanced callbacks are on, create a temp one and
03484     //  call out for that one.
03485     //
03486     XMLNotationDecl* decl = fDTDGrammar->getNotationDecl(bbName.getRawBuffer());
03487     bool isIgnoring = (decl != 0);
03488     if (isIgnoring)
03489     {
03490         fScanner->emitError(XMLErrs::NotationAlreadyExists, bbName.getRawBuffer());
03491     }
03492      else
03493     {
03494         // Fill in a new notation declaration and add it to the pool
03495         const XMLCh* publicId = bbPubId.getRawBuffer();
03496         const XMLCh* systemId = bbSysId.getRawBuffer();
03497         ReaderMgr::LastExtEntityInfo lastInfo;
03498         fReaderMgr->getLastExtEntityInfo(lastInfo);
03499 
03500         decl = new (fGrammarPoolMemoryManager) XMLNotationDecl
03501         (
03502             bbName.getRawBuffer()
03503             , (publicId && *publicId) ? publicId : 0
03504             , (systemId && *systemId) ? systemId : 0
03505             , (lastInfo.systemId && *lastInfo.systemId) ? lastInfo.systemId : 0
03506             , fGrammarPoolMemoryManager
03507         );
03508         fDTDGrammar->putNotationDecl(decl);
03509     }
03510 
03511     //
03512     //  If we have a document type handler, then tell it about this. If we
03513     //  are ignoring it, only call out if advanced callbacks are enabled.
03514     //
03515     if (fDocTypeHandler)
03516     {
03517         fDocTypeHandler->notationDecl
03518         (
03519             *decl
03520             , isIgnoring
03521         );
03522     }
03523 
03524     // And one more optional space or PE ref
03525     checkForPERef(false, true);
03526 
03527     // And skip the terminating bracket
03528     if (!fReaderMgr->skippedChar(chCloseAngle))
03529         fScanner->emitError(XMLErrs::UnterminatedNotationDecl);
03530 }
03531 
03532 
03533 //
03534 //  Scans a PI and calls the appropriate callbacks. A PI can happen in either
03535 //  the document or the DTD, so it calls the appropriate handler according
03536 //  to the fInDocument flag.
03537 //
03538 //  At entry we have just scanned the <? part, and need to now start on the
03539 //  PI target name.
03540 //
03541 void DTDScanner::scanPI()
03542 {
03543     const XMLCh* namePtr = 0;
03544     const XMLCh* targetPtr = 0;
03545 
03546     //
03547     //  If there are any spaces here, then warn about it. If we aren't in
03548     //  'first error' mode, then we'll come back and can easily pick up
03549     //  again by just skipping them.
03550     //
03551     if (fReaderMgr->lookingAtSpace())
03552     {
03553         fScanner->emitError(XMLErrs::PINameExpected);
03554         fReaderMgr->skipPastSpaces();
03555     }
03556 
03557     // Get a buffer for the PI name and scan it in
03558     XMLBufBid bbName(fBufMgr);
03559     if (!fReaderMgr->getName(bbName.getBuffer()))
03560     {
03561         fScanner->emitError(XMLErrs::PINameExpected);
03562         fReaderMgr->skipPastChar(chCloseAngle);
03563         return;
03564     }
03565 
03566     // Point the name pointer at the raw data
03567     namePtr = bbName.getRawBuffer();
03568 
03569     // See if it issome form of 'xml' and emit a warning
03570     //if (!XMLString::compareIString(namePtr, XMLUni::fgXMLString))
03571     if (bbName.getLen() == 3 &&
03572         (((namePtr[0] == chLatin_x) || (namePtr[0] == chLatin_X)) &&
03573          ((namePtr[1] == chLatin_m) || (namePtr[1] == chLatin_M)) &&
03574          ((namePtr[2] == chLatin_l) || (namePtr[2] == chLatin_L))))       
03575         fScanner->emitError(XMLErrs::NoPIStartsWithXML);
03576 
03577     // If namespaces are enabled, then no colons allowed
03578     if (fScanner->getDoNamespaces())
03579     {
03580         if (XMLString::indexOf(namePtr, chColon) != -1)
03581             fScanner->emitError(XMLErrs::ColonNotLegalWithNS);
03582     }
03583 
03584     //
03585     //  If we don't hit a space next, then the PI has no target. If we do
03586     //  then get out the target. Get a buffer for it as well
03587     //
03588     XMLBufBid bbTarget(fBufMgr);
03589     if (fReaderMgr->skippedSpace())
03590     {
03591         // Skip any leading spaces
03592         fReaderMgr->skipPastSpaces();
03593 
03594         bool gotLeadingSurrogate = false;
03595 
03596         // It does have a target, so lets move on to deal with that.
03597         while (1)
03598         {
03599             const XMLCh nextCh = fReaderMgr->getNextChar();
03600 
03601             // Watch for an end of file, which is always bad here
03602             if (!nextCh)
03603             {
03604                 fScanner->emitError(XMLErrs::UnterminatedPI);
03605                 ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
03606             }
03607 
03608             // Watch for potential terminating character
03609             if (nextCh == chQuestion)
03610             {
03611                 // It must be followed by '>' to be a termination of the target
03612                 if (fReaderMgr->skippedChar(chCloseAngle))
03613                     break;
03614             }
03615 
03616             // Check for correct surrogate pairs
03617             if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
03618             {
03619                 if (gotLeadingSurrogate)
03620                     fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
03621                 else
03622                     gotLeadingSurrogate = true;
03623             }
03624              else
03625             {
03626                 if (gotLeadingSurrogate)
03627                 {
03628                     if ((nextCh < 0xDC00) || (nextCh > 0xDFFF))
03629                         fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
03630                 }
03631                 // Its got to at least be a valid XML character
03632                 else if (!fReaderMgr->getCurrentReader()->isXMLChar(nextCh)) {
03633 
03634                     XMLCh tmpBuf[9];
03635                     XMLString::binToText
03636                     (
03637                         nextCh
03638                         , tmpBuf
03639                         , 8
03640                         , 16
03641                         , fMemoryManager
03642                     );
03643                     fScanner->emitError(XMLErrs::InvalidCharacter, tmpBuf);
03644                 }
03645 
03646                 gotLeadingSurrogate = false;
03647             }
03648             bbTarget.append(nextCh);
03649         }
03650     }
03651      else
03652     {
03653         // No target, but make sure its terminated ok
03654         if (!fReaderMgr->skippedChar(chQuestion))
03655         {
03656             fScanner->emitError(XMLErrs::UnterminatedPI);
03657             fReaderMgr->skipPastChar(chCloseAngle);
03658             return;
03659         }
03660 
03661         if (!fReaderMgr->skippedChar(chCloseAngle))
03662         {
03663             fScanner->emitError(XMLErrs::UnterminatedPI);
03664             fReaderMgr->skipPastChar(chCloseAngle);
03665             return;
03666         }
03667     }
03668 
03669     // Point the target pointer at the raw data
03670     targetPtr = bbTarget.getRawBuffer();
03671 
03672     //
03673     //  If we have a handler, then call it.
03674     //
03675     if (fDocTypeHandler)
03676     {
03677         fDocTypeHandler->doctypePI
03678         (
03679             namePtr
03680             , targetPtr
03681         );
03682     }
03683 }
03684 
03685 
03686 //
03687 //  This method scans a public literal. It must be quoted and all of its
03688 //  characters must be valid public id characters. The quotes are discarded
03689 //  and the results are returned.
03690 //
03691 bool DTDScanner::scanPublicLiteral(XMLBuffer& toFill)
03692 {
03693     toFill.reset();
03694 
03695     // Get the next char which must be a single or double quote
03696     XMLCh quoteCh;
03697     if (!fReaderMgr->skipIfQuote(quoteCh)) {
03698         fScanner->emitError(XMLErrs::ExpectedQuotedString);
03699         return false;
03700     }
03701 
03702     while (true)
03703     {
03704         const XMLCh nextCh = fReaderMgr->getNextChar();
03705 
03706         // Watch for EOF
03707         if (!nextCh)
03708             ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
03709 
03710         if (nextCh == quoteCh)
03711             break;
03712 
03713         //
03714         //  If its not a valid public id char, then report it but keep going
03715         //  since that's the best recovery scheme.
03716         //
03717         if (!fReaderMgr->getCurrentReader()->isPublicIdChar(nextCh))
03718         {
03719             XMLCh tmpBuf[9];
03720             XMLString::binToText
03721             (
03722                 nextCh
03723                 , tmpBuf
03724                 , 8
03725                 , 16
03726                 , fMemoryManager
03727             );
03728             fScanner->emitError(XMLErrs::InvalidPublicIdChar, tmpBuf);
03729         }
03730 
03731         toFill.append(nextCh);
03732     }
03733     return true;
03734 }
03735 
03736 
03737 //
03738 //  This method handles scanning in a quoted system literal. It expects to
03739 //  start on the open quote and returns after eating the ending quote. There
03740 //  are not really any restrictions on the contents of system literals.
03741 //
03742 bool DTDScanner::scanSystemLiteral(XMLBuffer& toFill)
03743 {
03744     toFill.reset();
03745 
03746     // Get the next char which must be a single or double quote
03747     XMLCh quoteCh;
03748     if (!fReaderMgr->skipIfQuote(quoteCh)) {
03749         fScanner->emitError(XMLErrs::ExpectedQuotedString);
03750         return false;
03751     }
03752 
03753         XMLCh nextCh;
03754     // Break out on terminating quote
03755     while ((nextCh=fReaderMgr->getNextChar())!=quoteCh)
03756     {
03757         // Watch for EOF
03758         if (!nextCh)
03759             ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
03760         toFill.append(nextCh);
03761     }
03762     return true;
03763 }
03764 
03765 
03766 
03767 //
03768 //  This method is called to scan a text decl line, which can be the first
03769 //  line in an external entity or external subset.
03770 //
03771 //  On entry the <? has been scanned, and next should be 'xml' followed by
03772 //  some whitespace, version string, etc...
03773 //    [77] TextDecl::= '<?xml' VersionInfo? EncodingDecl S? '?>'
03774 //
03775 void DTDScanner::scanTextDecl()
03776 {
03777     // Skip any subsequent whitespace before the version string
03778     fReaderMgr->skipPastSpaces();
03779 
03780     // Next should be the version string
03781     XMLBufBid bbVersion(fBufMgr);
03782     if (fReaderMgr->skippedString(XMLUni::fgVersionString))
03783     {
03784         if (!scanEq())
03785         {
03786             fScanner->emitError(XMLErrs::ExpectedEqSign);
03787             fReaderMgr->skipPastChar(chCloseAngle);
03788             return;
03789         }
03790 
03791         //
03792         //  Followed by a single or double quoted version. Get a buffer for
03793         //  the string.
03794         //
03795         if (!getQuotedString(bbVersion.getBuffer()))
03796         {
03797             fScanner->emitError(XMLErrs::BadXMLVersion);
03798             fReaderMgr->skipPastChar(chCloseAngle);
03799             return;
03800         }
03801 
03802         // If its not our supported version, issue an error but continue
03803         if (XMLString::equals(bbVersion.getRawBuffer(), XMLUni::fgVersion1_1)) {
03804             if (fScanner->getXMLVersion() != XMLReader::XMLV1_1)
03805                     fScanner->emitError(XMLErrs::UnsupportedXMLVersion, bbVersion.getRawBuffer());
03806         }
03807         else if (!XMLString::equals(bbVersion.getRawBuffer(), XMLUni::fgVersion1_0))
03808             fScanner->emitError(XMLErrs::UnsupportedXMLVersion, bbVersion.getRawBuffer());
03809     }
03810 
03811     // Ok, now we must have an encoding string
03812     XMLBufBid bbEncoding(fBufMgr);
03813     fReaderMgr->skipPastSpaces();
03814     bool gotEncoding = false;
03815     if (fReaderMgr->skippedString(XMLUni::fgEncodingString))
03816     {
03817         // There must be a equal sign next
03818         if (!scanEq())
03819         {
03820             fScanner->emitError(XMLErrs::ExpectedEqSign);
03821             fReaderMgr->skipPastChar(chCloseAngle);
03822             return;
03823         }
03824 
03825         // Followed by a single or double quoted version string
03826         getQuotedString(bbEncoding.getBuffer());
03827         if (bbEncoding.isEmpty() || !XMLString::isValidEncName(bbEncoding.getRawBuffer()))
03828         {
03829             fScanner->emitError(XMLErrs::BadXMLEncoding, bbEncoding.getRawBuffer());
03830             fReaderMgr->skipPastChar(chCloseAngle);
03831             return;
03832         }
03833 
03834         // Indicate that we got an encoding
03835         gotEncoding = true;
03836     }
03837 
03838     //
03839     // Encoding declarations are required in the external entity
03840     // if there is a text declaration present
03841     //
03842     if (!gotEncoding)
03843     {
03844       fScanner->emitError(XMLErrs::EncodingRequired);
03845       fReaderMgr->skipPastChar(chCloseAngle);
03846       return;
03847 
03848     }
03849 
03850     fReaderMgr->skipPastSpaces();
03851     if (!fReaderMgr->skippedChar(chQuestion))
03852     {
03853         fScanner->emitError(XMLErrs::UnterminatedXMLDecl);
03854         fReaderMgr->skipPastChar(chCloseAngle);
03855     }
03856      else if (!fReaderMgr->skippedChar(chCloseAngle))
03857     {
03858         fScanner->emitError(XMLErrs::UnterminatedXMLDecl);
03859         fReaderMgr->skipPastChar(chCloseAngle);
03860     }
03861 
03862     //
03863     //  If we have a document type handler and advanced callbacks are on,
03864     //  then call the TextDecl callback
03865     //
03866     if (fDocTypeHandler)
03867     {
03868         fDocTypeHandler->TextDecl
03869         (
03870             bbVersion.getRawBuffer()
03871             , bbEncoding.getRawBuffer()
03872         );
03873     }
03874 
03875     //
03876     //  If we got an encoding string, then we have to call back on the reader
03877     //  to tell it what the encoding is.
03878     //
03879     if (!bbEncoding.isEmpty())
03880     {
03881         if (!fReaderMgr->getCurrentReader()->setEncoding(bbEncoding.getRawBuffer()))
03882             fScanner->emitError(XMLErrs::ContradictoryEncoding, bbEncoding.getRawBuffer());
03883     }
03884 }
03885 
03886 XERCES_CPP_NAMESPACE_END