GME  13
XMLReader.cpp
Go to the documentation of this file.
00001 /*
00002  * Licensed to the Apache Software Foundation (ASF) under one or more
00003  * contributor license agreements.  See the NOTICE file distributed with
00004  * this work for additional information regarding copyright ownership.
00005  * The ASF licenses this file to You under the Apache License, Version 2.0
00006  * (the "License"); you may not use this file except in compliance with
00007  * the License.  You may obtain a copy of the License at
00008  *
00009  *      http://www.apache.org/licenses/LICENSE-2.0
00010  *
00011  * Unless required by applicable law or agreed to in writing, software
00012  * distributed under the License is distributed on an "AS IS" BASIS,
00013  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00014  * See the License for the specific language governing permissions and
00015  * limitations under the License.
00016  */
00017 
00018 /*
00019  * $Id: XMLReader.cpp 901280 2010-01-20 17:06:14Z johns $
00020  */
00021 
00022 // ---------------------------------------------------------------------------
00023 //  Includes
00024 // ---------------------------------------------------------------------------
00025 #include <xercesc/internal/XMLReader.hpp>
00026 #include <xercesc/util/BitOps.hpp>
00027 #include <xercesc/util/BinInputStream.hpp>
00028 #include <xercesc/util/PlatformUtils.hpp>
00029 #include <xercesc/util/RuntimeException.hpp>
00030 #include <xercesc/util/TransService.hpp>
00031 #include <xercesc/util/XMLEBCDICTranscoder.hpp>
00032 #include <xercesc/util/XMLString.hpp>
00033 #include <xercesc/util/Janitor.hpp>
00034 
00035 XERCES_CPP_NAMESPACE_BEGIN
00036 
00037 // ---------------------------------------------------------------------------
00038 //  XMLReader: Query Methods
00039 // ---------------------------------------------------------------------------
00040 //  Checks whether all of the chars in the passed buffer are whitespace or
00041 //  not. Breaks out on the first non-whitespace.
00042 //
00043 bool XMLReader::isAllSpaces(const   XMLCh* const    toCheck
00044                             , const XMLSize_t       count) const
00045 {
00046     const XMLCh* curCh = toCheck;
00047     const XMLCh* endPtr = toCheck + count;
00048     while (curCh < endPtr)
00049     {
00050         if (!(fgCharCharsTable[*curCh++] & gWhitespaceCharMask))
00051             return false;
00052     }
00053     return true;
00054 }
00055 
00056 
00057 //
00058 //  Checks whether at least one of the chars in the passed buffer are whitespace or
00059 //  not.
00060 //
00061 bool XMLReader::containsWhiteSpace(const   XMLCh* const    toCheck
00062                             , const XMLSize_t     count) const
00063 {
00064     const XMLCh* curCh = toCheck;
00065     const XMLCh* endPtr = toCheck + count;
00066     while (curCh < endPtr)
00067     {
00068         if (fgCharCharsTable[*curCh++] & gWhitespaceCharMask)
00069             return true;
00070     }
00071     return false;
00072 }
00073 
00074 //
00075 //  This one is not called terribly often, so call the XMLChar utility
00076 //
00077 bool XMLReader::isPublicIdChar(const XMLCh toCheck) const
00078 {
00079     if (fXMLVersion == XMLV1_1)
00080         return XMLChar1_1::isPublicIdChar(toCheck);
00081     else
00082         return XMLChar1_0::isPublicIdChar(toCheck);
00083 }
00084 
00085 // ---------------------------------------------------------------------------
00086 //  XMLReader: Constructors and Destructor
00087 // ---------------------------------------------------------------------------
00088 XMLReader::XMLReader(const  XMLCh* const          pubId
00089                     , const XMLCh* const          sysId
00090                     ,       BinInputStream* const streamToAdopt
00091                     , const RefFrom               from
00092                     , const Types                 type
00093                     , const Sources               source
00094                     , const bool                  throwAtEnd
00095                     , const bool                  calculateSrcOfs
00096                     ,       XMLSize_t             lowWaterMark
00097                     , const XMLVersion            version
00098                     ,       MemoryManager* const  manager) :
00099     fCharIndex(0)
00100     , fCharsAvail(0)
00101     , fCurCol(1)
00102     , fCurLine(1)
00103     , fEncodingStr(0)
00104     , fForcedEncoding(false)
00105     , fNoMore(false)
00106     , fPublicId(XMLString::replicate(pubId, manager))
00107     , fRawBufIndex(0)
00108     , fRawBytesAvail(0)
00109     , fLowWaterMark (lowWaterMark)
00110     , fReaderNum(0xFFFFFFFF)
00111     , fRefFrom(from)
00112     , fSentTrailingSpace(false)
00113     , fSource(source)
00114     , fSrcOfsBase(0)
00115     , fSrcOfsSupported(false)
00116     , fCalculateSrcOfs(calculateSrcOfs)
00117     , fSystemId(XMLString::replicate(sysId, manager))
00118     , fStream(streamToAdopt)
00119     , fSwapped(false)
00120     , fThrowAtEnd(throwAtEnd)
00121     , fTranscoder(0)
00122     , fType(type)
00123     , fMemoryManager(manager)
00124 {
00125     setXMLVersion(version);
00126 
00127     // Do an initial load of raw bytes
00128     refreshRawBuffer();
00129 
00130     // Ask the transcoding service if it supports src offset info
00131     fSrcOfsSupported = XMLPlatformUtils::fgTransService->supportsSrcOfs();
00132 
00133     //
00134     //  Use the recognizer class to get a basic sense of what family of
00135     //  encodings this file is in. We'll start off with a reader of that
00136     //  type, and update it later if needed when we read the XMLDecl line.
00137     //
00138     fEncoding = XMLRecognizer::basicEncodingProbe(fRawByteBuf, fRawBytesAvail);
00139 
00140     #if defined(XERCES_DEBUG)
00141     if ((fEncoding < XMLRecognizer::Encodings_Min)
00142     ||  (fEncoding > XMLRecognizer::Encodings_Max))
00143     {
00144         ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Reader_BadAutoEncoding, fMemoryManager);
00145     }
00146     #endif
00147 
00148     fEncodingStr = XMLString::replicate(XMLRecognizer::nameForEncoding(fEncoding, fMemoryManager), fMemoryManager);
00149 
00150     // Check whether the fSwapped flag should be set or not
00151     checkForSwapped();
00152 
00153     //
00154     //  This will check to see if the first line is an XMLDecl and, if
00155     //  so, decode that first line manually one character at a time. This
00156     //  leaves enough characters in the buffer that the high level code
00157     //  can get through the Decl and call us back with the real encoding.
00158     //
00159     doInitDecode();
00160 
00161     //
00162     //  NOTE: We won't create a transcoder until we either get a call to
00163     //  setEncoding() or we get a call to refreshCharBuffer() and no
00164     //  transcoder has been set yet.
00165     //
00166 }
00167 
00168 
00169 XMLReader::XMLReader(const  XMLCh* const          pubId
00170                     , const XMLCh* const          sysId
00171                     ,       BinInputStream* const streamToAdopt
00172                     , const XMLCh* const          encodingStr
00173                     , const RefFrom               from
00174                     , const Types                 type
00175                     , const Sources               source
00176                     , const bool                  throwAtEnd
00177                     , const bool                  calculateSrcOfs
00178                     ,       XMLSize_t             lowWaterMark
00179                     , const XMLVersion            version
00180                     ,       MemoryManager* const  manager) :
00181     fCharIndex(0)
00182     , fCharsAvail(0)
00183     , fCurCol(1)
00184     , fCurLine(1)
00185     , fEncoding(XMLRecognizer::UTF_8)
00186     , fEncodingStr(0)
00187     , fForcedEncoding(true)
00188     , fNoMore(false)
00189     , fPublicId(XMLString::replicate(pubId, manager))
00190     , fRawBufIndex(0)
00191     , fRawBytesAvail(0)
00192     , fLowWaterMark (lowWaterMark)
00193     , fReaderNum(0xFFFFFFFF)
00194     , fRefFrom(from)
00195     , fSentTrailingSpace(false)
00196     , fSource(source)
00197     , fSrcOfsBase(0)
00198     , fSrcOfsSupported(false)
00199     , fCalculateSrcOfs(calculateSrcOfs)
00200     , fSystemId(XMLString::replicate(sysId, manager))
00201     , fStream(streamToAdopt)
00202     , fSwapped(false)
00203     , fThrowAtEnd(throwAtEnd)
00204     , fTranscoder(0)
00205     , fType(type)
00206     , fMemoryManager(manager)
00207 {
00208     setXMLVersion(version);
00209 
00210     // Do an initial load of raw bytes
00211     refreshRawBuffer();
00212 
00213     // Copy the encoding string to our member
00214     fEncodingStr = XMLString::replicate(encodingStr, fMemoryManager);
00215     XMLString::upperCaseASCII(fEncodingStr);
00216 
00217     // Ask the transcoding service if it supports src offset info
00218     fSrcOfsSupported = XMLPlatformUtils::fgTransService->supportsSrcOfs();
00219 
00220     //
00221     //  Map the passed encoding name to one of our enums. If it does not
00222     //  match one of the intrinsic encodings, it will come back 'other',
00223     //  which tells us to create a transcoder based reader.
00224     //
00225     fEncoding = XMLRecognizer::encodingForName(fEncodingStr);
00226 
00227     //  test the presence of the BOM and remove it from the source
00228     switch(fEncoding)
00229     {
00230         case XMLRecognizer::UCS_4B :
00231         case XMLRecognizer::UCS_4L :
00232         {
00233             if (fRawBytesAvail > 4 &&
00234                 (((fRawByteBuf[0] == 0x00) && (fRawByteBuf[1] == 0x00) && (fRawByteBuf[2] == 0xFE) && (fRawByteBuf[3] == 0xFF)) ||
00235                  ((fRawByteBuf[0] == 0xFF) && (fRawByteBuf[1] == 0xFE) && (fRawByteBuf[2] == 0x00) && (fRawByteBuf[3] == 0x00)))  )
00236             {
00237                 fRawBufIndex += 4;
00238             }
00239             break;
00240         }
00241         case XMLRecognizer::UTF_8 :
00242         {
00243             // Look at the raw buffer as short chars
00244             const char* asChars = (const char*)fRawByteBuf;
00245 
00246             if (fRawBytesAvail > XMLRecognizer::fgUTF8BOMLen &&
00247                 XMLString::compareNString(  asChars
00248                                             , XMLRecognizer::fgUTF8BOM
00249                                             , XMLRecognizer::fgUTF8BOMLen) == 0)
00250             {
00251                 fRawBufIndex += XMLRecognizer::fgUTF8BOMLen;
00252             }
00253             break;
00254         }
00255         case XMLRecognizer::UTF_16B :
00256         case XMLRecognizer::UTF_16L :
00257         {
00258             if (fRawBytesAvail < 2)
00259                 break;
00260 
00261             const UTF16Ch* asUTF16 = (const UTF16Ch*)&fRawByteBuf[fRawBufIndex];
00262             if ((*asUTF16 == chUnicodeMarker) || (*asUTF16 == chSwappedUnicodeMarker))
00263             {
00264                 fRawBufIndex += sizeof(UTF16Ch);
00265             }
00266             break;
00267         }
00268         case XMLRecognizer::EBCDIC:
00269         case XMLRecognizer::US_ASCII:
00270         case XMLRecognizer::XERCES_XMLCH:
00271         case XMLRecognizer::OtherEncoding:
00272         case XMLRecognizer::Encodings_Count:
00273         {
00274             // silence warning about enumeration not being used
00275             break;
00276         }
00277     }
00278 
00279     // Check whether the fSwapped flag should be set or not
00280     checkForSwapped();
00281 
00282     //
00283     //  Create a transcoder for the encoding. Since the encoding has been
00284     //  forced, this will be the one we will use, period.
00285     //
00286     XMLTransService::Codes failReason;
00287     if (fEncoding == XMLRecognizer::OtherEncoding)
00288     {
00289         //
00290         //  fEncodingStr not  pre-recognized, use it
00291         //  directly for transcoder
00292         //
00293         fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor
00294         (
00295             fEncodingStr
00296             , failReason
00297             , kCharBufSize
00298             , fMemoryManager
00299         );
00300     }
00301      else
00302     {
00303         //
00304         //  Use the recognized fEncoding to create the transcoder
00305         //
00306         fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor
00307         (
00308             fEncoding
00309             , failReason
00310             , kCharBufSize
00311             , fMemoryManager
00312         );
00313 
00314     }
00315 
00316     if (!fTranscoder)
00317     {
00318         // We are about to throw which means the d-tor won't be called.
00319         // Clean up some memory.
00320         //
00321         fMemoryManager->deallocate(fPublicId);
00322         fMemoryManager->deallocate(fSystemId);
00323         ArrayJanitor<XMLCh> jan (fEncodingStr, fMemoryManager);
00324 
00325         ThrowXMLwithMemMgr1
00326         (
00327             TranscodingException
00328             , XMLExcepts::Trans_CantCreateCvtrFor
00329             , fEncodingStr
00330             , fMemoryManager
00331         );
00332     }
00333 
00334     //
00335     //  Note that, unlike above, we do not do an initial decode of the
00336     //  first line. We take the caller's word that the encoding is correct
00337     //  and just assume that the first bulk decode (kicked off by the first
00338     //  get of a character) will work.
00339     //
00340     //  So we do here the slipping in of the leading space if required.
00341     //
00342     if ((fType == Type_PE) && (fRefFrom == RefFrom_NonLiteral))
00343     {
00344         // This represents no data from the source
00345         fCharSizeBuf[fCharsAvail] = 0;
00346         fCharOfsBuf[fCharsAvail] = 0;
00347         fCharBuf[fCharsAvail++] = chSpace;
00348     }
00349 }
00350 
00351 
00352 XMLReader::XMLReader(const  XMLCh* const          pubId
00353                     , const XMLCh* const          sysId
00354                     ,       BinInputStream* const streamToAdopt
00355                     , XMLRecognizer::Encodings    encodingEnum
00356                     , const RefFrom               from
00357                     , const Types                 type
00358                     , const Sources               source
00359                     , const bool                  throwAtEnd
00360                     , const bool                  calculateSrcOfs
00361                     ,       XMLSize_t             lowWaterMark
00362                     , const XMLVersion            version
00363                     ,       MemoryManager* const  manager) :
00364     fCharIndex(0)
00365     , fCharsAvail(0)
00366     , fCurCol(1)
00367     , fCurLine(1)
00368     , fEncoding(XMLRecognizer::UTF_8)
00369     , fEncodingStr(0)
00370     , fForcedEncoding(true)
00371     , fNoMore(false)
00372     , fPublicId(XMLString::replicate(pubId, manager))
00373     , fRawBufIndex(0)
00374     , fRawBytesAvail(0)
00375     , fLowWaterMark (lowWaterMark)
00376     , fReaderNum(0xFFFFFFFF)
00377     , fRefFrom(from)
00378     , fSentTrailingSpace(false)
00379     , fSource(source)
00380     , fSrcOfsBase(0)
00381     , fSrcOfsSupported(false)
00382     , fCalculateSrcOfs(calculateSrcOfs)
00383     , fSystemId(XMLString::replicate(sysId, manager))
00384     , fStream(streamToAdopt)
00385     , fSwapped(false)
00386     , fThrowAtEnd(throwAtEnd)
00387     , fTranscoder(0)
00388     , fType(type)
00389     , fMemoryManager(manager)
00390 {
00391     setXMLVersion(version);
00392 
00393     // Do an initial load of raw bytes
00394     refreshRawBuffer();
00395 
00396     // Ask the transcoding service if it supports src offset info
00397     fSrcOfsSupported = XMLPlatformUtils::fgTransService->supportsSrcOfs();
00398 
00399     //
00400     //  Use the passed encoding code
00401     //
00402     fEncoding = encodingEnum;
00403     fEncodingStr = XMLString::replicate(XMLRecognizer::nameForEncoding(fEncoding, fMemoryManager), fMemoryManager);
00404 
00405     // Check whether the fSwapped flag should be set or not
00406     checkForSwapped();
00407 
00408     //
00409     //  Create a transcoder for the encoding. Since the encoding has been
00410     //  forced, this will be the one we will use, period.
00411     //
00412     XMLTransService::Codes failReason;
00413     fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor
00414     (
00415         fEncoding
00416         , failReason
00417         , kCharBufSize
00418         , fMemoryManager
00419     );
00420 
00421     if (!fTranscoder)
00422     {
00423         // We are about to throw which means the d-tor won't be called.
00424         // Clean up some memory.
00425         //
00426         fMemoryManager->deallocate(fPublicId);
00427         fMemoryManager->deallocate(fSystemId);
00428         ArrayJanitor<XMLCh> jan (fEncodingStr, fMemoryManager);
00429 
00430         ThrowXMLwithMemMgr1
00431         (
00432             TranscodingException
00433             , XMLExcepts::Trans_CantCreateCvtrFor
00434             , fEncodingStr
00435             , fMemoryManager
00436         );
00437     }
00438 
00439     //
00440     //  Note that, unlike above, we do not do an initial decode of the
00441     //  first line. We take the caller's word that the encoding is correct
00442     //  and just assume that the first bulk decode (kicked off by the first
00443     //  get of a character) will work.
00444     //
00445     //  So we do here the slipping in of the leading space if required.
00446     //
00447     if ((fType == Type_PE) && (fRefFrom == RefFrom_NonLiteral))
00448     {
00449         // This represents no data from the source
00450         fCharSizeBuf[fCharsAvail] = 0;
00451         fCharOfsBuf[fCharsAvail] = 0;
00452         fCharBuf[fCharsAvail++] = chSpace;
00453     }
00454 }
00455 
00456 
00457 XMLReader::~XMLReader()
00458 {
00459     fMemoryManager->deallocate(fEncodingStr);
00460     fMemoryManager->deallocate(fPublicId);
00461     fMemoryManager->deallocate(fSystemId);
00462     delete fStream;
00463     delete fTranscoder;
00464 }
00465 
00466 
00467 // ---------------------------------------------------------------------------
00468 //  XMLReader: Character buffer management methods
00469 // ---------------------------------------------------------------------------
00470 XMLFilePos XMLReader::getSrcOffset() const
00471 {
00472     if (!fSrcOfsSupported || !fCalculateSrcOfs)
00473         ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Reader_SrcOfsNotSupported, fMemoryManager);
00474 
00475     //
00476     //  Take the current source offset and add in the sizes that we've
00477     //  eaten from the source so far.
00478     //
00479     if( fCharIndex == 0 ) {
00480         return fSrcOfsBase;
00481     }
00482 
00483     if( fCharIndex < fCharsAvail ) {
00484 
00485         return (fSrcOfsBase + fCharOfsBuf[fCharIndex]);
00486     }
00487 
00488     return (fSrcOfsBase + fCharOfsBuf[fCharIndex-1] + fCharSizeBuf[fCharIndex-1]);
00489 }
00490 
00491 
00492 bool XMLReader::refreshCharBuffer()
00493 {
00494     // If the no more flag is set, then don't bother doing anything.
00495     if (fNoMore)
00496         return false;
00497 
00498     XMLSize_t startInd;
00499 
00500     // See if we have any existing chars.
00501     const XMLSize_t spareChars = fCharsAvail - fCharIndex;
00502 
00503     // If we are full, then don't do anything.
00504     if (spareChars == kCharBufSize)
00505         return true;
00506 
00507     //
00508     //  If no transcoder has been created yet, then we never saw the
00509     //  any encoding="" string and the encoding was not forced, so lets
00510     //  create one now. We know that it won't change now.
00511     //
00512     //  However, note that if we autosensed EBCDIC, then we have to
00513     //  consider it an error if we never got an encoding since we don't
00514     //  know what variant of EBCDIC it is.
00515     //
00516     if (!fTranscoder)
00517     {
00518         if (fEncoding == XMLRecognizer::EBCDIC)
00519             ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Reader_EncodingStrRequired, fMemoryManager);
00520 
00521         // Ask the transcoding service to make use a transcoder
00522         XMLTransService::Codes failReason;
00523         fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor
00524         (
00525             fEncodingStr
00526             , failReason
00527             , kCharBufSize
00528             , fMemoryManager
00529         );
00530 
00531         if (!fTranscoder)
00532         {
00533             ThrowXMLwithMemMgr1
00534             (
00535                 TranscodingException
00536                 , XMLExcepts::Trans_CantCreateCvtrFor
00537                 , fEncodingStr
00538                 , fMemoryManager
00539             );
00540         }
00541     }
00542 
00543     //
00544     //  Add the number of source bytes eaten so far to the base src
00545     //  offset member.
00546     //
00547     if (fCalculateSrcOfs) {
00548         for (startInd = 0; startInd < fCharIndex; startInd++)
00549             fSrcOfsBase += fCharSizeBuf[startInd];
00550     }
00551 
00552     //
00553     //  If there are spare chars, then move then down to the bottom. We
00554     //  have to move the char sizes down also.
00555     //
00556     startInd = 0;
00557     if (spareChars)
00558     {
00559         for (XMLSize_t index = fCharIndex; index < fCharsAvail; index++)
00560         {
00561             fCharBuf[startInd] = fCharBuf[index];
00562             fCharSizeBuf[startInd] = fCharSizeBuf[index];
00563             startInd++;
00564         }
00565     }
00566 
00567     //
00568     //  And then get more chars, starting after any spare chars that were
00569     //  left over from the last time.
00570     //
00571     fCharsAvail = xcodeMoreChars
00572     (
00573         &fCharBuf[startInd]
00574         , &fCharSizeBuf[startInd]
00575         , kCharBufSize - spareChars
00576     );
00577 
00578     // Add back in the spare chars
00579     fCharsAvail += spareChars;
00580 
00581     // Reset the buffer index to zero, so we start from the 0th char again
00582     fCharIndex = 0;
00583 
00584     //
00585     //  If no chars available, then we have to check for one last thing. If
00586     //  this is reader for a PE and its not being expanded inside a literal,
00587     //  then unget a trailing space. We use a boolean to avoid triggering
00588     //  this more than once.
00589     //
00590     if (!fCharsAvail
00591     &&  (fType == Type_PE)
00592     &&  (fRefFrom == RefFrom_NonLiteral)
00593     &&  !fSentTrailingSpace)
00594     {
00595         fCharBuf[0] = chSpace;
00596         fCharsAvail = 1;
00597         fSentTrailingSpace = true;
00598     }
00599 
00600     //
00601     //  If we get here with no more chars, then set the fNoMore flag which
00602     //  lets us optimize and know without checking that no more chars are
00603     //  available.
00604     //
00605     if (!fCharsAvail)
00606         fNoMore = true;
00607 
00608     //  Calculate fCharOfsBuf using the elements from fCharBufSize
00609     if (fCalculateSrcOfs)
00610     {
00611         unsigned int last = 0;
00612         fCharOfsBuf[0] = 0;
00613         for (XMLSize_t index = 1; index < fCharsAvail; ++index) {
00614             fCharOfsBuf[index] = last+fCharSizeBuf[index-1];
00615             last = fCharOfsBuf[index];
00616             // code was:
00617             // fCharOfsBuf[index] = fCharOfsBuf[index-1]+fCharSizeBuf[index-1];
00618             // but on Solaris 64 bit with sun studio 11 this didn't work as
00619             // every value of fCharOfsBuf[] was 1.
00620         }
00621     }
00622 
00623     return (fCharsAvail != 0);
00624 }
00625 
00626 
00627 
00628 // ---------------------------------------------------------------------------
00629 //  XMLReader: Scanning methods
00630 // ---------------------------------------------------------------------------
00631 bool XMLReader::getName(XMLBuffer& toFill, const bool token)
00632 {
00633     //  Ok, first lets see if we have chars in the buffer. If not, then lets
00634     //  reload.
00635     if (fCharIndex == fCharsAvail)
00636     {
00637         if (!refreshCharBuffer())
00638             return false;
00639     }
00640 
00641     XMLSize_t charIndex_start = fCharIndex;
00642 
00643     //  Lets check the first char for being a first name char. If not, then
00644     //  what's the point in living mannnn? Just give up now. We only do this
00645     //  if its a name and not a name token that they want.
00646     if (!token)
00647     {
00648         if (fXMLVersion == XMLV1_1 && ((fCharBuf[fCharIndex] >= 0xD800) && (fCharBuf[fCharIndex] <= 0xDB7F))) {
00649            // make sure one more char is in the buffer, the transcoder
00650            // should put only a complete surrogate pair into the buffer
00651            assert(fCharIndex+1 < fCharsAvail);
00652            if ((fCharBuf[fCharIndex+1] < 0xDC00) || (fCharBuf[fCharIndex+1] > 0xDFFF))
00653                return false;
00654 
00655             // Looks ok, so lets eat it
00656             fCharIndex += 2;
00657         }
00658         else {
00659             if (!isFirstNameChar(fCharBuf[fCharIndex]))
00660                 return false;
00661 
00662             // Looks ok, so lets eat it
00663             fCharIndex ++;
00664         }
00665 
00666     }
00667 
00668     //  And now we loop until we run out of data in this reader or we hit
00669     //  a non-name char.
00670     while (true)
00671     {
00672         if (fXMLVersion == XMLV1_1)
00673         {
00674             while (fCharIndex < fCharsAvail)
00675             {
00676                 //  Check the current char and take it if its a name char. Else
00677                 //  break out.
00678                 if ( (fCharBuf[fCharIndex] >= 0xD800) && (fCharBuf[fCharIndex] <= 0xDB7F) )
00679                 {
00680                     // make sure one more char is in the buffer, the transcoder
00681                     // should put only a complete surrogate pair into the buffer
00682                     assert(fCharIndex+1 < fCharsAvail);
00683                     if ( (fCharBuf[fCharIndex+1] < 0xDC00) ||
00684                          (fCharBuf[fCharIndex+1] > 0xDFFF)  )
00685                         break;
00686                     fCharIndex += 2;
00687 
00688                 }
00689                 else
00690                 {
00691                     if (!isNameChar(fCharBuf[fCharIndex]))
00692                         break;
00693                     fCharIndex++;
00694                 }
00695             }
00696         }
00697         else // XMLV1_0
00698         {
00699             while (fCharIndex < fCharsAvail)
00700             {
00701                 if (!isNameChar(fCharBuf[fCharIndex]))
00702                     break;
00703                 fCharIndex++;
00704             }
00705         }
00706 
00707         // we have to copy the accepted character(s), and update column
00708         if (fCharIndex != charIndex_start)
00709         {
00710             fCurCol += (XMLFileLoc)(fCharIndex - charIndex_start);
00711             toFill.append(&fCharBuf[charIndex_start], fCharIndex - charIndex_start);
00712         }
00713 
00714         // something is wrong if there is still something in the buffer
00715         // or if we don't get no more, then break out.
00716         if ((fCharIndex < fCharsAvail) ||
00717              !refreshCharBuffer())
00718             break;
00719 
00720         charIndex_start = fCharIndex;
00721     }
00722 
00723     return !toFill.isEmpty();
00724 }
00725 
00726 bool XMLReader::getNCName(XMLBuffer& toFill)
00727 {
00728     if (fCharIndex == fCharsAvail && !refreshCharBuffer())
00729         return false;
00730 
00731     XMLSize_t charIndex_start = fCharIndex, count;
00732     //  Lets check the first char for being a first name char. If not, then
00733     //  what's the point in living mannnn? Just give up now. We only do this
00734     //  if its a name and not a name token that they want.
00735     if (fXMLVersion == XMLV1_1
00736         && ((fCharBuf[fCharIndex] >= 0xD800) && (fCharBuf[fCharIndex] <= 0xDB7F))) {
00737         // make sure one more char is in the buffer, the transcoder
00738         // should put only a complete surrogate pair into the buffer
00739         assert(fCharIndex+1 < fCharsAvail);
00740         if ((fCharBuf[fCharIndex+1] < 0xDC00) || (fCharBuf[fCharIndex+1] > 0xDFFF))
00741             return false;
00742 
00743         // Looks ok, so lets eat it
00744         fCharIndex += 2;
00745     }
00746     else {
00747         if (!isFirstNCNameChar(fCharBuf[fCharIndex])) {
00748             return false;
00749         }
00750 
00751         // Looks ok, so lets eat it
00752         fCharIndex++;
00753     }
00754 
00755     do
00756     {
00757         if (fCharIndex == fCharsAvail)
00758         {
00759             // we have to copy the accepted character(s), and update the column number,
00760             // before getting new data and losing the value of fCharIndex
00761             if((count = fCharIndex - charIndex_start)!=0)
00762             {
00763                 fCurCol += (XMLFileLoc)count;
00764                 toFill.append(&fCharBuf[charIndex_start], count);
00765             }
00766             if(!refreshCharBuffer())
00767                 return true;
00768             charIndex_start = fCharIndex;
00769         }
00770 
00771         //  Check the current char and take it if it's a name char
00772         if (fXMLVersion == XMLV1_1)
00773         {
00774             while(fCharIndex < fCharsAvail)
00775             {
00776                 if(isNCNameChar(fCharBuf[fCharIndex])) fCharIndex++;
00777                 else if((fCharBuf[fCharIndex] >= 0xD800) && (fCharBuf[fCharIndex] <= 0xDB7F) && ((fCharBuf[fCharIndex+1] < 0xDC00) || (fCharBuf[fCharIndex+1] > 0xDFFF))) fCharIndex+=2;
00778                 else break;
00779             }
00780         }
00781         else
00782             while(fCharIndex < fCharsAvail && isNCNameChar(fCharBuf[fCharIndex])) fCharIndex++;
00783         // if we didn't consume the entire buffer, we are done
00784     } while(fCharIndex == fCharsAvail);
00785 
00786     // we have to copy the accepted character(s), and update column
00787     if((count = fCharIndex - charIndex_start)!=0)
00788     {
00789         fCurCol += (XMLFileLoc)count;
00790         toFill.append(&fCharBuf[charIndex_start], count);
00791     }
00792     return true;
00793 }
00794 
00795 bool XMLReader::getQName(XMLBuffer& toFill, int* colonPosition)
00796 {
00797     // We are only looking for two iterations (i.e. 'NCNAME':'NCNAME').
00798     // We will stop when we finished scanning for a QName (i.e. either a second
00799     // colon or an invalid char).
00800     if(!getNCName(toFill))
00801     {
00802         *colonPosition = -1;
00803         return false;
00804     }
00805     if (fCharIndex == fCharsAvail && !refreshCharBuffer())
00806     {
00807         *colonPosition = -1;
00808         return true;
00809     }
00810     if (fCharBuf[fCharIndex] != chColon)
00811     {
00812         *colonPosition = -1;
00813         return true;
00814     }
00815 
00816     *colonPosition = (int)toFill.getLen();
00817     toFill.append(chColon);
00818     fCharIndex++;
00819     fCurCol++;
00820     return getNCName(toFill);
00821 }
00822 
00823 bool XMLReader::getSpaces(XMLBuffer& toFill)
00824 {
00825     //
00826     //  We just loop until we either hit a non-space or the end of this
00827     //  entity. We return true if we returned because of a non-space and
00828     //  false if because of end of entity.
00829     //
00830     //  NOTE:   We have to maintain line/col info here and we have to do
00831     //          whitespace normalization if we are not already internalized.
00832     //
00833     while (true)
00834     {
00835         // Loop through the current chars in the buffer
00836         while (fCharIndex < fCharsAvail)
00837         {
00838             // Get the current char out of the buffer
00839             XMLCh curCh = fCharBuf[fCharIndex];
00840 
00841             //
00842             //  See if its a white space char. If so, then process it. Else
00843             //  we've hit a non-space and need to return.
00844             //
00845             if (isWhitespace(curCh))
00846             {
00847                 // Eat this char
00848                 fCharIndex++;
00849 
00850                 //
00851                 //  'curCh' is a whitespace(x20|x9|xD|xA), so we only can have
00852                 //  end-of-line combinations with a leading chCR(xD) or chLF(xA)
00853                 //
00854                 //  100000 x20
00855                 //  001001 x9
00856                 //  001010 chLF
00857                 //  001101 chCR
00858                 //  -----------
00859                 //  000110 == (chCR|chLF) & ~(0x9|0x20)
00860                 //
00861                 //  if the result of thelogical-& operation is
00862                 //  true  : 'curCh' must be xA  or xD
00863                 //  false : 'curCh' must be x20 or x9
00864                 //
00865                 if ( ( curCh & (chCR|chLF) & ~(0x9|0x20) ) == 0 )
00866                 {
00867                     fCurCol++;
00868                 } else
00869                 {
00870                     handleEOL(curCh, false);
00871                 }
00872 
00873                 // Ok we can add this guy to our buffer
00874                 toFill.append(curCh);
00875             }
00876              else
00877             {
00878                 // Return true to indicate we broke out due to a whitespace
00879                 return true;
00880             }
00881         }
00882 
00883         //
00884         //  We've eaten up the current buffer, so lets try to reload it. If
00885         //  we don't get anything new, then break out. If we do, then we go
00886         //  back to the top to keep getting spaces.
00887         //
00888         if (!refreshCharBuffer())
00889             break;
00890     }
00891     return false;
00892 }
00893 
00894 
00895 bool XMLReader::getUpToCharOrWS(XMLBuffer& toFill, const XMLCh toCheck)
00896 {
00897     while (true)
00898     {
00899         // Loop through the current chars in the buffer
00900         while (fCharIndex < fCharsAvail)
00901         {
00902             // Get the current char out of the buffer
00903             XMLCh curCh = fCharBuf[fCharIndex];
00904 
00905             //
00906             //  See if its not a white space or our target char, then process
00907             //  it. Else, we need to return.
00908             //
00909             if (!isWhitespace(curCh) && (curCh != toCheck))
00910             {
00911                 // Eat this char
00912                 fCharIndex++;
00913 
00914                 //
00915                 //  'curCh' is not a whitespace(x20|x9|xD|xA), so we only can
00916                 //  have end-of-line combinations with a leading chNEL(x85) or
00917                 //  chLineSeparator(x2028)
00918                 //
00919                 //  0010000000101000 chLineSeparator
00920                 //  0000000010000101 chNEL
00921                 //  ---------------------
00922                 //  1101111101010010 == ~(chNEL|chLineSeparator)
00923                 //
00924                 //  if the result of the logical-& operation is
00925                 //  true  : 'curCh' can not be chNEL or chLineSeparator
00926                 //  false : 'curCh' can be chNEL or chLineSeparator
00927                 //
00928                 if ( curCh & (XMLCh) ~(chNEL|chLineSeparator) )
00929                 {
00930                     fCurCol++;
00931                 } else
00932                 {
00933                     handleEOL(curCh, false);
00934                 }
00935 
00936                 // Add it to our buffer
00937                 toFill.append(curCh);
00938             }
00939              else
00940             {
00941                 return true;
00942             }
00943         }
00944 
00945         //
00946         //  We've eaten up the current buffer, so lets try to reload it. If
00947         //  we don't get anything new, then break out. If we do, then we go
00948         //  back to the top to keep getting spaces.
00949         //
00950         if (!refreshCharBuffer())
00951             break;
00952     }
00953 
00954     // We never hit any non-space and ate up the whole reader
00955     return false;
00956 
00957 }
00958 
00959 bool XMLReader::skipIfQuote(XMLCh& chGotten)
00960 {
00961     if (fCharIndex == fCharsAvail && !refreshCharBuffer())
00962         return false;
00963 
00964     chGotten = fCharBuf[fCharIndex];
00965     if ((chGotten == chDoubleQuote) || (chGotten == chSingleQuote))
00966     {
00967         fCharIndex++;
00968         fCurCol++;
00969         return true;
00970     }
00971     return false;
00972 }
00973 
00974 
00975 bool XMLReader::skipSpaces(bool& skippedSomething, bool inDecl)
00976 {
00977     //  DO NOT set the skippedSomething to 'false', but change it to be 'true' only
00978 
00979     //  We enter a loop where we skip over spaces until we hit the end of
00980     //  this reader or a non-space value. The return indicates whether we
00981     //  hit the non-space (true) or the end (false).
00982     do
00983     {
00984         // Loop through the current chars in the buffer
00985         while (fCharIndex < fCharsAvail)
00986         {
00987             //  See if its a white space char. If so, then process it. Else
00988             //  we've hit a non-space and need to return.
00989             if (isWhitespace(fCharBuf[fCharIndex]))
00990             {
00991                 // Get the current char out of the buffer and eat it
00992                 XMLCh curCh = fCharBuf[fCharIndex++];
00993                 skippedSomething = true;
00994                 //
00995                 //  'curCh' is a whitespace(x20|x9|xD|xA), so we only can have
00996                 //  end-of-line combinations with a leading chCR(xD) or chLF(xA)
00997                 //
00998                 //  100000 x20
00999                 //  001001 x9
01000                 //  001010 chLF
01001                 //  001101 chCR
01002                 //  -----------
01003                 //  000110 == (chCR|chLF) & ~(0x9|0x20)
01004                 //
01005                 //  if the result of the logical-& operation is
01006                 //  true  : 'curCh' must be xA  or xD
01007                 //  false : 'curCh' must be x20 or x9
01008                 //
01009                 if ( ( curCh & (chCR|chLF) & ~(0x9|0x20) ) == 0 )
01010                 {
01011                     fCurCol++;
01012                 } else
01013                 {
01014                     handleEOL(curCh, inDecl);
01015                 }
01016             }
01017             else
01018                 return true;
01019         }
01020 
01021         //  We've eaten up the current buffer, so lets try to reload it. If
01022         //  we don't get anything new, then break out. If we do, then we go
01023         //  back to the top to keep getting spaces.
01024     } while(refreshCharBuffer());
01025 
01026     // We never hit any non-space and ate up the whole reader
01027     return false;
01028 }
01029 
01030 bool XMLReader::skippedChar(const XMLCh toSkip)
01031 {
01032     //
01033     //  If the buffer is empty, then try to reload it. If we still get
01034     //  nothing, then return false.
01035     //
01036     if (fCharIndex == fCharsAvail)
01037     {
01038         if (!refreshCharBuffer())
01039             return false;
01040     }
01041 
01042     //
01043     //  See if the current char is the one we want. If so, then we need
01044     //  to eat it and return true.
01045     //
01046     if (fCharBuf[fCharIndex] == toSkip)
01047     {
01048         fCharIndex++;
01049         fCurCol++;
01050         return true;
01051     }
01052     return false;
01053 }
01054 
01055 
01056 bool XMLReader::skippedSpace()
01057 {
01058     //
01059     //  If the buffer is empty, then try to reload it. If we still get
01060     //  nothing, then return false.
01061     //
01062     if (fCharIndex == fCharsAvail)
01063     {
01064         if (!refreshCharBuffer())
01065             return false;
01066     }
01067 
01068     //
01069     //  See if the current char is a whitespace. If so, then we need to eat
01070     //  it and return true.
01071     //
01072     const XMLCh curCh = fCharBuf[fCharIndex];
01073     if (isWhitespace(curCh))
01074     {
01075         // Eat the character
01076         fCharIndex++;
01077 
01078         //
01079         //  'curCh' is a whitespace(x20|x9|xD|xA), so we only can have
01080         //  end-of-line combinations with a leading chCR(xD) or chLF(xA)
01081         //
01082         //  100000 x20
01083         //  001001 x9
01084         //  001010 chLF
01085         //  001101 chCR
01086         //  -----------
01087         //  000110 == (chCR|chLF) & ~(0x9|0x20)
01088         //
01089         //  if the result of the logical-& operation is
01090         //  true  : 'curCh' must be xA  or xD
01091         //  false : 'curCh' must be x20 or x9
01092         //
01093         if ( ( curCh & (chCR|chLF) & ~(0x9|0x20) ) == 0 )
01094         {
01095             fCurCol++;
01096         } else
01097         {
01098             handleEOL((XMLCh&)curCh, false);
01099         }
01100 
01101         return true;
01102     }
01103     return false;
01104 }
01105 
01106 bool XMLReader::skippedString(const XMLCh* const toSkip)
01107 {
01108     // This function works on strings that are smaller than kCharBufSize.
01109     // This function guarantees that in case the comparison is unsuccessful
01110     // the fCharIndex will point to the original data.
01111     //
01112 
01113     // Get the length of the string to skip.
01114     //
01115     const XMLSize_t srcLen = XMLString::stringLen(toSkip);
01116     XMLSize_t charsLeft = charsLeftInBuffer();
01117 
01118     //  See if the current reader has enough chars to test against this
01119     //  string. If not, then ask it to reload its buffer. If that does not
01120     //  get us enough, then it cannot match.
01121     //
01122     //  NOTE: This works because strings never have to cross a reader! And
01123     //  a string to skip will never have a new line in it, so we will never
01124     //  miss adjusting the current line.
01125     //
01126     while (charsLeft < srcLen)
01127     {
01128       if (!refreshCharBuffer())
01129         return false;
01130 
01131       XMLSize_t tmp = charsLeftInBuffer();
01132       if (tmp == charsLeft) // if the refreshCharBuf() did not add anything new
01133         return false;     // give up and return.
01134 
01135       charsLeft = tmp;
01136     }
01137 
01138     //  Ok, now we now that the current reader has enough chars in its
01139     //  buffer and that its index is back at zero. So we can do a quick and
01140     //  dirty comparison straight to its buffer with no requirement to unget
01141     //  if it fails.
01142     //
01143     if (memcmp(&fCharBuf[fCharIndex], toSkip, srcLen * sizeof(XMLCh)))
01144       return false;
01145 
01146     // Add the source length to the current column to get it back right.
01147     //
01148     fCurCol += (XMLFileLoc)srcLen;
01149 
01150     //  And get the character buffer index back right by just adding the
01151     //  source len to it.
01152     //
01153     fCharIndex += srcLen;
01154 
01155     return true;
01156 }
01157 
01158 bool XMLReader::skippedStringLong(const XMLCh* toSkip)
01159 {
01160     // This function works on strings that are potentially longer than
01161     // kCharBufSize (e.g., end tag). This function does not guarantee
01162     // that in case the comparison is unsuccessful the fCharIndex will
01163     // point to the original data.
01164     //
01165 
01166     XMLSize_t srcLen = XMLString::stringLen(toSkip);
01167     XMLSize_t charsLeft = charsLeftInBuffer();
01168 
01169     while (srcLen != 0)
01170     {
01171       // Fill up the buffer with as much data as possible.
01172       //
01173       while (charsLeft < srcLen && charsLeft != kCharBufSize)
01174       {
01175         if (!refreshCharBuffer())
01176           return false;
01177 
01178         XMLSize_t tmp = charsLeftInBuffer();
01179         if (tmp == charsLeft) // if the refreshCharBuf() did not add anything
01180           return false;       // new give up and return.
01181 
01182         charsLeft = tmp;
01183       }
01184 
01185       XMLSize_t n = charsLeft < srcLen ? charsLeft : srcLen;
01186 
01187       if (memcmp(&fCharBuf[fCharIndex], toSkip, n * sizeof(XMLCh)))
01188         return false;
01189 
01190       toSkip += n;
01191       srcLen -= n;
01192 
01193       fCharIndex += n;
01194       fCurCol += (XMLFileLoc)n;
01195       charsLeft -= n;
01196     }
01197 
01198     return true;
01199 }
01200 
01201 //
01202 // This is just to peek if the next coming buffer
01203 // matches the string toPeek.
01204 // Similar to skippedString, but just the fCharIndex and fCurCol are not updated
01205 //
01206 bool XMLReader::peekString(const XMLCh* const toPeek)
01207 {
01208     // Get the length of the string to skip
01209     const XMLSize_t srcLen = XMLString::stringLen(toPeek);
01210 
01211     //
01212     //  See if the current reader has enough chars to test against this
01213     //  string. If not, then ask it to reload its buffer. If that does not
01214     //  get us enough, then it cannot match.
01215     //
01216     //  NOTE: This works because strings never have to cross a reader! And
01217     //  a string to skip will never have a new line in it, so we will never
01218     //  miss adjusting the current line.
01219     //
01220     XMLSize_t charsLeft = charsLeftInBuffer();
01221     while (charsLeft < srcLen)
01222     {
01223          refreshCharBuffer();
01224          XMLSize_t t = charsLeftInBuffer();
01225          if (t == charsLeft)   // if the refreshCharBuf() did not add anything new
01226              return false;     //   give up and return.
01227          charsLeft = t;
01228         }
01229 
01230 
01231 
01232 
01233     //
01234     //  Ok, now we now that the current reader has enough chars in its
01235     //  buffer and that its index is back at zero. So we can do a quick and
01236     //  dirty comparison straight to its buffer with no requirement to unget
01237     //  if it fails.
01238     //
01239     if (memcmp(&fCharBuf[fCharIndex], toPeek, srcLen*sizeof(XMLCh)))
01240         return false;
01241 
01242     return true;
01243 }
01244 
01245 
01246 // ---------------------------------------------------------------------------
01247 //  XMLReader: Setter methods (most are inlined)
01248 // ---------------------------------------------------------------------------
01249 bool XMLReader::setEncoding(const XMLCh* const newEncoding)
01250 {
01251     //
01252     //  If the encoding was forced, then we ignore the new value and just
01253     //  return with success. If it was forced, then we are to use that
01254     //  encoding without question. Note that, if we are forced, we created
01255     //  a transcoder up front so there is no need to do one here in that
01256     //  case.
01257     //
01258     if (fForcedEncoding)
01259         return true;
01260 
01261     //
01262     // upperCase the newEncoding first for better performance
01263     //
01264     XMLCh* inputEncoding = XMLString::replicate(newEncoding, fMemoryManager);
01265     XMLString::upperCaseASCII(inputEncoding);
01266 
01267     XMLRecognizer::Encodings newBaseEncoding;
01268     //
01269     //  Check for non-endian specific UTF-16 or UCS-4. If so, and if we
01270     //  are already in one of the endian versions of those encodings,
01271     //  then just keep it and go on. Otherwise, its not valid.
01272     //
01273     if (XMLString::equals(inputEncoding, XMLUni::fgUTF16EncodingString)
01274     ||  XMLString::equals(inputEncoding, XMLUni::fgUTF16EncodingString2)
01275     ||  XMLString::equals(inputEncoding, XMLUni::fgUTF16EncodingString3)
01276     ||  XMLString::equals(inputEncoding, XMLUni::fgUTF16EncodingString4)
01277     ||  XMLString::equals(inputEncoding, XMLUni::fgUTF16EncodingString5)
01278     ||  XMLString::equals(inputEncoding, XMLUni::fgUTF16EncodingString6)
01279     ||  XMLString::equals(inputEncoding, XMLUni::fgUTF16EncodingString7))
01280     {
01281         fMemoryManager->deallocate(inputEncoding);
01282 
01283         if ((fEncoding != XMLRecognizer::UTF_16L)
01284         &&  (fEncoding != XMLRecognizer::UTF_16B))
01285         {
01286             return false;
01287         }
01288 
01289         // Override with the original endian specific encoding
01290         newBaseEncoding = fEncoding;
01291 
01292         if (fEncoding == XMLRecognizer::UTF_16L) {
01293             fMemoryManager->deallocate(fEncodingStr);
01294             fEncodingStr = 0;
01295             fEncodingStr = XMLString::replicate(XMLUni::fgUTF16LEncodingString, fMemoryManager);
01296         }
01297         else {
01298             fMemoryManager->deallocate(fEncodingStr);
01299             fEncodingStr = 0;
01300             fEncodingStr = XMLString::replicate(XMLUni::fgUTF16BEncodingString, fMemoryManager);
01301         }
01302     }
01303     else if (XMLString::equals(inputEncoding, XMLUni::fgUCS4EncodingString)
01304          ||  XMLString::equals(inputEncoding, XMLUni::fgUCS4EncodingString2)
01305          ||  XMLString::equals(inputEncoding, XMLUni::fgUCS4EncodingString3)
01306          ||  XMLString::equals(inputEncoding, XMLUni::fgUCS4EncodingString4)
01307          ||  XMLString::equals(inputEncoding, XMLUni::fgUCS4EncodingString5))
01308     {
01309         fMemoryManager->deallocate(inputEncoding);
01310 
01311         if ((fEncoding != XMLRecognizer::UCS_4L)
01312         &&  (fEncoding != XMLRecognizer::UCS_4B))
01313         {
01314             return false;
01315         }
01316 
01317         // Override with the original endian specific encoding
01318         newBaseEncoding = fEncoding;
01319 
01320         if (fEncoding == XMLRecognizer::UCS_4L) {
01321 
01322             fMemoryManager->deallocate(fEncodingStr);
01323             fEncodingStr = 0;
01324             fEncodingStr = XMLString::replicate(XMLUni::fgUCS4LEncodingString, fMemoryManager);
01325         }
01326         else {
01327 
01328             fMemoryManager->deallocate(fEncodingStr);
01329             fEncodingStr = 0;
01330             fEncodingStr = XMLString::replicate(XMLUni::fgUCS4BEncodingString, fMemoryManager);
01331         }
01332     }
01333      else
01334     {
01335         //
01336         //  Try to map the string to one of our standard encodings. If its not
01337         //  one of them, then it has to be one of the non-intrinsic encodings,
01338         //  in which case we have to delete our intrinsic encoder and create a
01339         //  new one.
01340         //
01341         newBaseEncoding = XMLRecognizer::encodingForName(inputEncoding);
01342 
01343         //
01344         //  If it does not come back as one of the auto-sensed encodings, then we
01345         //  have to possibly replace it and at least check a few things.
01346         //
01347         if (newBaseEncoding == XMLRecognizer::OtherEncoding)
01348         {
01349             //
01350             // We already know it's none of those non-endian special cases,
01351             // so just replicate the new name and use it directly to create the transcoder
01352             //
01353             fMemoryManager->deallocate(fEncodingStr);
01354             fEncodingStr = inputEncoding;
01355 
01356             XMLTransService::Codes failReason;
01357             fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor
01358             (
01359                 fEncodingStr
01360                 , failReason
01361                 , kCharBufSize
01362                 , fMemoryManager
01363             );
01364         }
01365         else
01366         {
01367             // Store the new encoding string since it is just an intrinsic
01368             fMemoryManager->deallocate(fEncodingStr);
01369             fEncodingStr = inputEncoding;
01370         }
01371     }
01372 
01373     if (!fTranscoder) {
01374         //
01375         //  Now we can create a transcoder using the recognized fEncoding.  We
01376         //  might get back a transcoder for an intrinsically supported encoding,
01377         //  or we might get one from the underlying transcoding service.
01378         //
01379         XMLTransService::Codes failReason;
01380         fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor
01381         (
01382             newBaseEncoding
01383             , failReason
01384             , kCharBufSize
01385             , fMemoryManager
01386         );
01387 
01388         if (!fTranscoder)
01389             ThrowXMLwithMemMgr1(TranscodingException, XMLExcepts::Trans_CantCreateCvtrFor, fEncodingStr, fMemoryManager);
01390     }
01391 
01392     // Update the base encoding member with the new base encoding found
01393     fEncoding = newBaseEncoding;
01394 
01395     // Looks ok to us
01396     return true;
01397 }
01398 
01399 
01400 // ---------------------------------------------------------------------------
01401 //  XMLReader: Private helper methods
01402 // ---------------------------------------------------------------------------
01403 
01404 //
01405 //  This is called when the encoding flag is set and just sets the fSwapped
01406 //  flag appropriately.
01407 //
01408 void XMLReader::checkForSwapped()
01409 {
01410     // Assume not swapped
01411     fSwapped = false;
01412 
01413         if (XMLPlatformUtils::fgXMLChBigEndian)
01414         {
01415         if ((fEncoding == XMLRecognizer::UTF_16L)
01416         ||  (fEncoding == XMLRecognizer::UCS_4L))
01417         {
01418             fSwapped = true;
01419         }
01420     }
01421     else
01422     {
01423         if ((fEncoding == XMLRecognizer::UTF_16B)
01424         ||  (fEncoding == XMLRecognizer::UCS_4B))
01425         {
01426             fSwapped = true;
01427         }
01428     }
01429 }
01430 
01431 
01432 //
01433 //  This is called from the constructor when the encoding is not forced.
01434 //  We assume that the encoding has been auto-sensed at this point and that
01435 //  fSwapped is set correctly.
01436 //
01437 //  In the case of UCS-4 and EBCDIC, we don't have to check for a decl.
01438 //  The fact that we got here, means that there is one, because that's the
01439 //  only way we can autosense those.
01440 //
01441 void XMLReader::doInitDecode()
01442 {
01443     switch(fEncoding)
01444     {
01445         case XMLRecognizer::UCS_4B :
01446         case XMLRecognizer::UCS_4L :
01447         {
01448             // Remove bom if any
01449             if (((fRawByteBuf[0] == 0x00) && (fRawByteBuf[1] == 0x00) && (fRawByteBuf[2] == 0xFE) && (fRawByteBuf[3] == 0xFF)) ||
01450                 ((fRawByteBuf[0] == 0xFF) && (fRawByteBuf[1] == 0xFE) && (fRawByteBuf[2] == 0x00) && (fRawByteBuf[3] == 0x00))  )
01451             {
01452                 for (XMLSize_t i = 0; i < fRawBytesAvail; i++)
01453                     fRawByteBuf[i] = fRawByteBuf[i+4];
01454 
01455                 fRawBytesAvail -=4;
01456             }
01457 
01458             // Look at the raw buffer as UCS4 chars
01459             const UCS4Ch* asUCS = (const UCS4Ch*)fRawByteBuf;
01460 
01461             while (fRawBufIndex < fRawBytesAvail)
01462             {
01463                 // Get out the current 4 byte value and inc our raw buf index
01464                 UCS4Ch curVal = *asUCS++;
01465                 fRawBufIndex += sizeof(UCS4Ch);
01466 
01467                 // Swap if that is required for this machine
01468                 if (fSwapped)
01469                     curVal = BitOps::swapBytes(curVal);
01470 
01471                 // Make sure its at least semi legal. If not, undo and throw
01472                 if (curVal > 0xFFFF)
01473                 {
01474                     fCharsAvail = 0;
01475                     fRawBufIndex = 0;
01476                     fMemoryManager->deallocate(fPublicId);
01477                     fMemoryManager->deallocate(fEncodingStr);
01478                     ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager);
01479                     ThrowXMLwithMemMgr1
01480                     (
01481                         TranscodingException
01482                         , XMLExcepts::Reader_CouldNotDecodeFirstLine
01483                         , fSystemId
01484                         , fMemoryManager
01485                     );
01486                 }
01487 
01488                 // Convert the value to an XML char and store it
01489                 fCharSizeBuf[fCharsAvail] = 4;
01490                 fCharBuf[fCharsAvail++] = XMLCh(curVal);
01491 
01492                 // Break out on the > character
01493                 if (curVal == chCloseAngle)
01494                     break;
01495             }
01496             break;
01497         }
01498 
01499         case XMLRecognizer::UTF_8 :
01500         {
01501             // If there's a utf-8 BOM  (0xEF 0xBB 0xBF), skip past it.
01502             //   Don't move to char buf - no one wants to see it.
01503             //   Note: this causes any encoding= declaration to override
01504             //         the BOM's attempt to say that the encoding is utf-8.
01505 
01506             // Look at the raw buffer as short chars
01507             const char* asChars = (const char*)fRawByteBuf;
01508 
01509             if (fRawBytesAvail > XMLRecognizer::fgUTF8BOMLen &&
01510                 XMLString::compareNString(  asChars
01511                                             , XMLRecognizer::fgUTF8BOM
01512                                             , XMLRecognizer::fgUTF8BOMLen) == 0)
01513             {
01514                 fRawBufIndex += XMLRecognizer::fgUTF8BOMLen;
01515                 asChars      += XMLRecognizer::fgUTF8BOMLen;
01516             }
01517 
01518             //
01519             //  First check that there are enough bytes to even see the
01520             //  decl indentifier. If not, get out now with no action since
01521             //  there is no decl.
01522             //
01523             if (fRawBytesAvail < XMLRecognizer::fgASCIIPreLen)
01524                 break;
01525 
01526             // Check for the opening sequence. If not, then no decl
01527             if (XMLString::compareNString(  asChars
01528                                             , XMLRecognizer::fgASCIIPre
01529                                             , XMLRecognizer::fgASCIIPreLen))
01530             {
01531                 break;
01532             }
01533 
01534             while (fRawBufIndex < fRawBytesAvail)
01535             {
01536                 const char curCh = *asChars++;
01537                 fRawBufIndex++;
01538 
01539                 // Looks ok, so store it
01540                 fCharSizeBuf[fCharsAvail] = 1;
01541                 fCharBuf[fCharsAvail++] = XMLCh(curCh);
01542 
01543                 // Break out on a > character
01544                 if (curCh == chCloseAngle)
01545                     break;
01546 
01547                 //
01548                 //  A char greater than 0x7F is not allowed in this case. If
01549                 //  so, undo and throw.
01550                 //
01551                 if (curCh & 0x80)
01552                 {
01553                     fCharsAvail = 0;
01554                     fRawBufIndex = 0;
01555                     fMemoryManager->deallocate(fPublicId);
01556                     fMemoryManager->deallocate(fEncodingStr);
01557                     ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager);
01558                     ThrowXMLwithMemMgr1
01559                     (
01560                         TranscodingException
01561                         , XMLExcepts::Reader_CouldNotDecodeFirstLine
01562                         , fSystemId
01563                         , fMemoryManager
01564                     );
01565                 }
01566             }
01567             break;
01568         }
01569 
01570         case XMLRecognizer::UTF_16B :
01571         case XMLRecognizer::UTF_16L :
01572         {
01573             //
01574             //  If there is a decl here, we just truncate back the characters
01575             //  as we go. No surrogate creation would be allowed here in legal
01576             //  XML, so we consider it a transoding error if we find one.
01577             //
01578             if (fRawBytesAvail < 2)
01579                 break;
01580 
01581             XMLSize_t postBOMIndex = 0;
01582             const UTF16Ch* asUTF16 = (const UTF16Ch*)&fRawByteBuf[fRawBufIndex];
01583             if ((*asUTF16 == chUnicodeMarker) || (*asUTF16 == chSwappedUnicodeMarker))
01584             {
01585                 fRawBufIndex += sizeof(UTF16Ch);
01586                 asUTF16++;
01587                 postBOMIndex = fRawBufIndex;
01588             }
01589 
01590             //  First check that there are enough raw bytes for there to even
01591             //  be a decl indentifier. If not, then nothing to do.
01592             //
01593             if (fRawBytesAvail - fRawBufIndex < XMLRecognizer::fgUTF16PreLen)
01594             {
01595                 fRawBufIndex = postBOMIndex;
01596                 break;
01597             }
01598 
01599             //
01600             //  See we get a match on the prefix. If not, then reset and
01601             //  break out.
01602             //
01603             if (fEncoding == XMLRecognizer::UTF_16B)
01604             {
01605                 if (memcmp(asUTF16, XMLRecognizer::fgUTF16BPre, XMLRecognizer::fgUTF16PreLen))
01606                 {
01607                     fRawBufIndex = postBOMIndex;
01608                     break;
01609                 }
01610             }
01611              else
01612             {
01613                 if (memcmp(asUTF16, XMLRecognizer::fgUTF16LPre, XMLRecognizer::fgUTF16PreLen))
01614                 {
01615                     fRawBufIndex = postBOMIndex;
01616                     break;
01617                 }
01618             }
01619 
01620             while (fRawBufIndex < fRawBytesAvail)
01621             {
01622                 // Get out the current 2 byte value
01623                 UTF16Ch curVal = *asUTF16++;
01624                 fRawBufIndex += sizeof(UTF16Ch);
01625 
01626                 // Swap if that is required for this machine
01627                 if (fSwapped)
01628                     curVal = BitOps::swapBytes(curVal);
01629 
01630                 //
01631                 //  Store it and bump the target index, implicitly converting
01632                 //  if UTF16Ch and XMLCh are not the same size.
01633                 //
01634                 fCharSizeBuf[fCharsAvail] = 2;
01635                 fCharBuf[fCharsAvail++] = curVal;
01636 
01637                 // Break out on a > char
01638                 if (curVal == chCloseAngle)
01639                     break;
01640             }
01641             break;
01642         }
01643 
01644         case XMLRecognizer::EBCDIC :
01645         {
01646             //
01647             //  We use special support in the intrinsic EBCDIC-US transcoder
01648             //  to go through one char at a time.
01649             //
01650             const XMLByte* srcPtr = fRawByteBuf;
01651             while (1)
01652             {
01653                 // Transcode one char from the source
01654                 const XMLCh chCur = XMLEBCDICTranscoder::xlatThisOne(*srcPtr++);
01655                 fRawBufIndex++;
01656 
01657                 //
01658                 //  And put it into the character buffer. This stuff has to
01659                 //  look like it was normally transcoded.
01660                 //
01661                 fCharSizeBuf[fCharsAvail] = 1;
01662                 fCharBuf[fCharsAvail++] = chCur;
01663 
01664                 // If its a > char, then break out
01665                 if (chCur == chCloseAngle)
01666                     break;
01667 
01668                 // Watch for using up all input and get out
01669                 if (fRawBufIndex == fRawBytesAvail)
01670                     break;
01671             }
01672             break;
01673         }
01674 
01675         default :
01676             // It should never be anything else here
01677             fMemoryManager->deallocate(fPublicId);
01678             fMemoryManager->deallocate(fEncodingStr);
01679             fMemoryManager->deallocate(fSystemId);
01680             ThrowXMLwithMemMgr(TranscodingException, XMLExcepts::Reader_BadAutoEncoding, fMemoryManager);
01681             break;
01682     }
01683 
01684     //
01685     //  Ok, by the time we get here, if its a legal XML file we have eaten
01686     //  the XML/TextDecl. So, if we are a PE and are being referenced from
01687     //  outside a literal, then we need to throw in an arbitrary space that
01688     //  is required by XML.
01689     //
01690     if ((fType == Type_PE) && (fRefFrom == RefFrom_NonLiteral))
01691         fCharBuf[fCharsAvail++] = chSpace;
01692 
01693     //  Calculate fCharOfsBuf buffer using the elements from fCharSizeBuf
01694     if (fCalculateSrcOfs)
01695     {
01696         fCharOfsBuf[0] = 0;
01697         for (XMLSize_t index = 1; index < fCharsAvail; ++index) {
01698             fCharOfsBuf[index] = fCharOfsBuf[index-1]+fCharSizeBuf[index-1];
01699         }
01700     }
01701 }
01702 
01703 
01704 //
01705 //  This method is called internally when we run out of bytes in the raw
01706 //  buffer. We just read as many bytes as we can into the raw buffer again
01707 //  and store the number of bytes we got.
01708 //
01709 void XMLReader::refreshRawBuffer()
01710 {
01711     //
01712     //  If there are any bytes left, move them down to the start. There
01713     //  should only ever be (max bytes per char - 1) at the most.
01714     //
01715     const XMLSize_t bytesLeft = fRawBytesAvail - fRawBufIndex;
01716 
01717     // Move the existing ones down
01718     for (XMLSize_t index = 0; index < bytesLeft; index++)
01719         fRawByteBuf[index] = fRawByteBuf[fRawBufIndex + index];
01720 
01721     //
01722     //  And then read into the buffer past the existing bytes. Add back in
01723     //  that many to the bytes read, and subtract that many from the bytes
01724     //  requested.
01725     //
01726     fRawBytesAvail = fStream->readBytes
01727     (
01728         &fRawByteBuf[bytesLeft], kRawBufSize - bytesLeft
01729     ) + bytesLeft;
01730 
01731     //
01732     //  We need to reset the buffer index back to the start in all cases,
01733     //  since any trailing data was copied down to the start.
01734     //
01735     fRawBufIndex = 0;
01736 }
01737 
01738 
01739 //
01740 //  This method is called internally when we run out of characters in the
01741 //  trancoded character buffer. We transcode up to another maxChars chars
01742 //  from the
01743 //
01744 XMLSize_t
01745 XMLReader::xcodeMoreChars(          XMLCh* const            bufToFill
01746                             ,       unsigned char* const    charSizes
01747                             , const XMLSize_t               maxChars)
01748 {
01749     XMLSize_t charsDone = 0;
01750     XMLSize_t bytesEaten = 0;
01751     bool needMode = false;
01752 
01753     while (!bytesEaten)
01754     {
01755         // If our raw buffer is low, then lets load up another batch of
01756         // raw bytes now.
01757         //
01758         XMLSize_t bytesLeft = fRawBytesAvail - fRawBufIndex;
01759         if (needMode || bytesLeft == 0 || bytesLeft < fLowWaterMark)
01760         {
01761             refreshRawBuffer();
01762 
01763             // If there are no characters or if we need more but didn't get
01764             // any, return zero now.
01765             //
01766             if (fRawBytesAvail == 0 ||
01767                 (needMode && (bytesLeft == fRawBytesAvail - fRawBufIndex)))
01768                 return 0;
01769         }
01770 
01771         // Ask the transcoder to internalize another batch of chars. It is
01772         // possible that there is data in the raw buffer but the transcoder
01773         // is unable to produce anything because transcoding of multi-byte
01774         // encodings may have left a few bytes representing a partial
01775         // character in the buffer that can't be used until the next chunk
01776         // (and the rest of the character) is read. In this case set the
01777         // needMore flag and try again.
01778         //
01779 
01780         charsDone = fTranscoder->transcodeFrom
01781           (
01782             &fRawByteBuf[fRawBufIndex]
01783             , fRawBytesAvail - fRawBufIndex
01784             , bufToFill
01785             , maxChars
01786             , bytesEaten
01787             , charSizes
01788           );
01789 
01790         if (bytesEaten == 0)
01791             needMode = true;
01792         else
01793             fRawBufIndex += bytesEaten;
01794     }
01795 
01796     return charsDone;
01797 }
01798 
01799 /***
01800  *
01801  * XML1.1
01802  *
01803  * 2.11 End-of-Line Handling
01804  *
01805  *    XML parsed entities are often stored in computer files which, for editing
01806  *    convenience, are organized into lines. These lines are typically separated
01807  *    by some combination of the characters CARRIAGE RETURN (#xD) and LINE FEED (#xA).
01808  *
01809  *    To simplify the tasks of applications, the XML processor MUST behave as if
01810  *    it normalized all line breaks in external parsed entities (including the document
01811  *    entity) on input, before parsing, by translating all of the following to a single
01812  *    #xA character:
01813  *
01814  *  1. the two-character sequence #xD #xA
01815  *  2. the two-character sequence #xD #x85
01816  *  3. the single character #x85
01817  *  4. the single character #x2028
01818  *  5. any #xD character that is not immediately followed by #xA or #x85.
01819  *
01820  *
01821  ***/
01822 void XMLReader::handleEOL(XMLCh& curCh, bool inDecl)
01823 {
01824     // 1. the two-character sequence #xD #xA
01825     // 2. the two-character sequence #xD #x85
01826     // 5. any #xD character that is not immediately followed by #xA or #x85.
01827     switch(curCh)
01828     {
01829     case chCR:
01830         fCurCol = 1;
01831         fCurLine++;
01832 
01833         //
01834         //  If not already internalized, then convert it to an
01835         //  LF and eat any following LF.
01836         //
01837         if (fSource == Source_External)
01838         {
01839             if ((fCharIndex < fCharsAvail) || refreshCharBuffer())
01840             {
01841                 if ( fCharBuf[fCharIndex] == chLF              ||
01842                     ((fCharBuf[fCharIndex] == chNEL) && fNEL)  )
01843                 {
01844                     fCharIndex++;
01845                 }
01846             }
01847             curCh = chLF;
01848         }
01849         break;
01850 
01851     case chLF:
01852         fCurCol = 1;
01853         fCurLine++;
01854         break;
01855 
01856     // 3. the single character #x85
01857     // 4. the single character #x2028
01858     case chNEL:
01859     case chLineSeparator:
01860         if (inDecl && fXMLVersion == XMLV1_1)
01861         {
01862 
01863         /***
01864          * XML1.1
01865          *
01866          * 2.11 End-of-Line Handling
01867          *  ...
01868          *   The characters #x85 and #x2028 cannot be reliably recognized and translated
01869          *   until an entity's encoding declaration (if present) has been read.
01870          *   Therefore, it is a fatal error to use them within the XML declaration or
01871          *   text declaration.
01872          *
01873          ***/
01874             ThrowXMLwithMemMgr1
01875                 (
01876                 TranscodingException
01877                 , XMLExcepts::Reader_NelLsepinDecl
01878                 , fSystemId
01879                 , fMemoryManager
01880                 );
01881         }
01882 
01883         if (fNEL && fSource == Source_External)
01884         {
01885             fCurCol = 1;
01886             fCurLine++;
01887             curCh = chLF;
01888         }
01889         break;
01890     default:
01891         fCurCol++;
01892     }
01893 }
01894 
01895 XERCES_CPP_NAMESPACE_END