GME
13
|
00001 /* 00002 * Licensed to the Apache Software Foundation (ASF) under one or more 00003 * contributor license agreements. See the NOTICE file distributed with 00004 * this work for additional information regarding copyright ownership. 00005 * The ASF licenses this file to You under the Apache License, Version 2.0 00006 * (the "License"); you may not use this file except in compliance with 00007 * the License. You may obtain a copy of the License at 00008 * 00009 * http://www.apache.org/licenses/LICENSE-2.0 00010 * 00011 * Unless required by applicable law or agreed to in writing, software 00012 * distributed under the License is distributed on an "AS IS" BASIS, 00013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00014 * See the License for the specific language governing permissions and 00015 * limitations under the License. 00016 */ 00017 00018 /* 00019 * $Id: XMLReader.cpp 901280 2010-01-20 17:06:14Z johns $ 00020 */ 00021 00022 // --------------------------------------------------------------------------- 00023 // Includes 00024 // --------------------------------------------------------------------------- 00025 #include <xercesc/internal/XMLReader.hpp> 00026 #include <xercesc/util/BitOps.hpp> 00027 #include <xercesc/util/BinInputStream.hpp> 00028 #include <xercesc/util/PlatformUtils.hpp> 00029 #include <xercesc/util/RuntimeException.hpp> 00030 #include <xercesc/util/TransService.hpp> 00031 #include <xercesc/util/XMLEBCDICTranscoder.hpp> 00032 #include <xercesc/util/XMLString.hpp> 00033 #include <xercesc/util/Janitor.hpp> 00034 00035 XERCES_CPP_NAMESPACE_BEGIN 00036 00037 // --------------------------------------------------------------------------- 00038 // XMLReader: Query Methods 00039 // --------------------------------------------------------------------------- 00040 // Checks whether all of the chars in the passed buffer are whitespace or 00041 // not. Breaks out on the first non-whitespace. 00042 // 00043 bool XMLReader::isAllSpaces(const XMLCh* const toCheck 00044 , const XMLSize_t count) const 00045 { 00046 const XMLCh* curCh = toCheck; 00047 const XMLCh* endPtr = toCheck + count; 00048 while (curCh < endPtr) 00049 { 00050 if (!(fgCharCharsTable[*curCh++] & gWhitespaceCharMask)) 00051 return false; 00052 } 00053 return true; 00054 } 00055 00056 00057 // 00058 // Checks whether at least one of the chars in the passed buffer are whitespace or 00059 // not. 00060 // 00061 bool XMLReader::containsWhiteSpace(const XMLCh* const toCheck 00062 , const XMLSize_t count) const 00063 { 00064 const XMLCh* curCh = toCheck; 00065 const XMLCh* endPtr = toCheck + count; 00066 while (curCh < endPtr) 00067 { 00068 if (fgCharCharsTable[*curCh++] & gWhitespaceCharMask) 00069 return true; 00070 } 00071 return false; 00072 } 00073 00074 // 00075 // This one is not called terribly often, so call the XMLChar utility 00076 // 00077 bool XMLReader::isPublicIdChar(const XMLCh toCheck) const 00078 { 00079 if (fXMLVersion == XMLV1_1) 00080 return XMLChar1_1::isPublicIdChar(toCheck); 00081 else 00082 return XMLChar1_0::isPublicIdChar(toCheck); 00083 } 00084 00085 // --------------------------------------------------------------------------- 00086 // XMLReader: Constructors and Destructor 00087 // --------------------------------------------------------------------------- 00088 XMLReader::XMLReader(const XMLCh* const pubId 00089 , const XMLCh* const sysId 00090 , BinInputStream* const streamToAdopt 00091 , const RefFrom from 00092 , const Types type 00093 , const Sources source 00094 , const bool throwAtEnd 00095 , const bool calculateSrcOfs 00096 , XMLSize_t lowWaterMark 00097 , const XMLVersion version 00098 , MemoryManager* const manager) : 00099 fCharIndex(0) 00100 , fCharsAvail(0) 00101 , fCurCol(1) 00102 , fCurLine(1) 00103 , fEncodingStr(0) 00104 , fForcedEncoding(false) 00105 , fNoMore(false) 00106 , fPublicId(XMLString::replicate(pubId, manager)) 00107 , fRawBufIndex(0) 00108 , fRawBytesAvail(0) 00109 , fLowWaterMark (lowWaterMark) 00110 , fReaderNum(0xFFFFFFFF) 00111 , fRefFrom(from) 00112 , fSentTrailingSpace(false) 00113 , fSource(source) 00114 , fSrcOfsBase(0) 00115 , fSrcOfsSupported(false) 00116 , fCalculateSrcOfs(calculateSrcOfs) 00117 , fSystemId(XMLString::replicate(sysId, manager)) 00118 , fStream(streamToAdopt) 00119 , fSwapped(false) 00120 , fThrowAtEnd(throwAtEnd) 00121 , fTranscoder(0) 00122 , fType(type) 00123 , fMemoryManager(manager) 00124 { 00125 setXMLVersion(version); 00126 00127 // Do an initial load of raw bytes 00128 refreshRawBuffer(); 00129 00130 // Ask the transcoding service if it supports src offset info 00131 fSrcOfsSupported = XMLPlatformUtils::fgTransService->supportsSrcOfs(); 00132 00133 // 00134 // Use the recognizer class to get a basic sense of what family of 00135 // encodings this file is in. We'll start off with a reader of that 00136 // type, and update it later if needed when we read the XMLDecl line. 00137 // 00138 fEncoding = XMLRecognizer::basicEncodingProbe(fRawByteBuf, fRawBytesAvail); 00139 00140 #if defined(XERCES_DEBUG) 00141 if ((fEncoding < XMLRecognizer::Encodings_Min) 00142 || (fEncoding > XMLRecognizer::Encodings_Max)) 00143 { 00144 ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Reader_BadAutoEncoding, fMemoryManager); 00145 } 00146 #endif 00147 00148 fEncodingStr = XMLString::replicate(XMLRecognizer::nameForEncoding(fEncoding, fMemoryManager), fMemoryManager); 00149 00150 // Check whether the fSwapped flag should be set or not 00151 checkForSwapped(); 00152 00153 // 00154 // This will check to see if the first line is an XMLDecl and, if 00155 // so, decode that first line manually one character at a time. This 00156 // leaves enough characters in the buffer that the high level code 00157 // can get through the Decl and call us back with the real encoding. 00158 // 00159 doInitDecode(); 00160 00161 // 00162 // NOTE: We won't create a transcoder until we either get a call to 00163 // setEncoding() or we get a call to refreshCharBuffer() and no 00164 // transcoder has been set yet. 00165 // 00166 } 00167 00168 00169 XMLReader::XMLReader(const XMLCh* const pubId 00170 , const XMLCh* const sysId 00171 , BinInputStream* const streamToAdopt 00172 , const XMLCh* const encodingStr 00173 , const RefFrom from 00174 , const Types type 00175 , const Sources source 00176 , const bool throwAtEnd 00177 , const bool calculateSrcOfs 00178 , XMLSize_t lowWaterMark 00179 , const XMLVersion version 00180 , MemoryManager* const manager) : 00181 fCharIndex(0) 00182 , fCharsAvail(0) 00183 , fCurCol(1) 00184 , fCurLine(1) 00185 , fEncoding(XMLRecognizer::UTF_8) 00186 , fEncodingStr(0) 00187 , fForcedEncoding(true) 00188 , fNoMore(false) 00189 , fPublicId(XMLString::replicate(pubId, manager)) 00190 , fRawBufIndex(0) 00191 , fRawBytesAvail(0) 00192 , fLowWaterMark (lowWaterMark) 00193 , fReaderNum(0xFFFFFFFF) 00194 , fRefFrom(from) 00195 , fSentTrailingSpace(false) 00196 , fSource(source) 00197 , fSrcOfsBase(0) 00198 , fSrcOfsSupported(false) 00199 , fCalculateSrcOfs(calculateSrcOfs) 00200 , fSystemId(XMLString::replicate(sysId, manager)) 00201 , fStream(streamToAdopt) 00202 , fSwapped(false) 00203 , fThrowAtEnd(throwAtEnd) 00204 , fTranscoder(0) 00205 , fType(type) 00206 , fMemoryManager(manager) 00207 { 00208 setXMLVersion(version); 00209 00210 // Do an initial load of raw bytes 00211 refreshRawBuffer(); 00212 00213 // Copy the encoding string to our member 00214 fEncodingStr = XMLString::replicate(encodingStr, fMemoryManager); 00215 XMLString::upperCaseASCII(fEncodingStr); 00216 00217 // Ask the transcoding service if it supports src offset info 00218 fSrcOfsSupported = XMLPlatformUtils::fgTransService->supportsSrcOfs(); 00219 00220 // 00221 // Map the passed encoding name to one of our enums. If it does not 00222 // match one of the intrinsic encodings, it will come back 'other', 00223 // which tells us to create a transcoder based reader. 00224 // 00225 fEncoding = XMLRecognizer::encodingForName(fEncodingStr); 00226 00227 // test the presence of the BOM and remove it from the source 00228 switch(fEncoding) 00229 { 00230 case XMLRecognizer::UCS_4B : 00231 case XMLRecognizer::UCS_4L : 00232 { 00233 if (fRawBytesAvail > 4 && 00234 (((fRawByteBuf[0] == 0x00) && (fRawByteBuf[1] == 0x00) && (fRawByteBuf[2] == 0xFE) && (fRawByteBuf[3] == 0xFF)) || 00235 ((fRawByteBuf[0] == 0xFF) && (fRawByteBuf[1] == 0xFE) && (fRawByteBuf[2] == 0x00) && (fRawByteBuf[3] == 0x00))) ) 00236 { 00237 fRawBufIndex += 4; 00238 } 00239 break; 00240 } 00241 case XMLRecognizer::UTF_8 : 00242 { 00243 // Look at the raw buffer as short chars 00244 const char* asChars = (const char*)fRawByteBuf; 00245 00246 if (fRawBytesAvail > XMLRecognizer::fgUTF8BOMLen && 00247 XMLString::compareNString( asChars 00248 , XMLRecognizer::fgUTF8BOM 00249 , XMLRecognizer::fgUTF8BOMLen) == 0) 00250 { 00251 fRawBufIndex += XMLRecognizer::fgUTF8BOMLen; 00252 } 00253 break; 00254 } 00255 case XMLRecognizer::UTF_16B : 00256 case XMLRecognizer::UTF_16L : 00257 { 00258 if (fRawBytesAvail < 2) 00259 break; 00260 00261 const UTF16Ch* asUTF16 = (const UTF16Ch*)&fRawByteBuf[fRawBufIndex]; 00262 if ((*asUTF16 == chUnicodeMarker) || (*asUTF16 == chSwappedUnicodeMarker)) 00263 { 00264 fRawBufIndex += sizeof(UTF16Ch); 00265 } 00266 break; 00267 } 00268 case XMLRecognizer::EBCDIC: 00269 case XMLRecognizer::US_ASCII: 00270 case XMLRecognizer::XERCES_XMLCH: 00271 case XMLRecognizer::OtherEncoding: 00272 case XMLRecognizer::Encodings_Count: 00273 { 00274 // silence warning about enumeration not being used 00275 break; 00276 } 00277 } 00278 00279 // Check whether the fSwapped flag should be set or not 00280 checkForSwapped(); 00281 00282 // 00283 // Create a transcoder for the encoding. Since the encoding has been 00284 // forced, this will be the one we will use, period. 00285 // 00286 XMLTransService::Codes failReason; 00287 if (fEncoding == XMLRecognizer::OtherEncoding) 00288 { 00289 // 00290 // fEncodingStr not pre-recognized, use it 00291 // directly for transcoder 00292 // 00293 fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor 00294 ( 00295 fEncodingStr 00296 , failReason 00297 , kCharBufSize 00298 , fMemoryManager 00299 ); 00300 } 00301 else 00302 { 00303 // 00304 // Use the recognized fEncoding to create the transcoder 00305 // 00306 fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor 00307 ( 00308 fEncoding 00309 , failReason 00310 , kCharBufSize 00311 , fMemoryManager 00312 ); 00313 00314 } 00315 00316 if (!fTranscoder) 00317 { 00318 // We are about to throw which means the d-tor won't be called. 00319 // Clean up some memory. 00320 // 00321 fMemoryManager->deallocate(fPublicId); 00322 fMemoryManager->deallocate(fSystemId); 00323 ArrayJanitor<XMLCh> jan (fEncodingStr, fMemoryManager); 00324 00325 ThrowXMLwithMemMgr1 00326 ( 00327 TranscodingException 00328 , XMLExcepts::Trans_CantCreateCvtrFor 00329 , fEncodingStr 00330 , fMemoryManager 00331 ); 00332 } 00333 00334 // 00335 // Note that, unlike above, we do not do an initial decode of the 00336 // first line. We take the caller's word that the encoding is correct 00337 // and just assume that the first bulk decode (kicked off by the first 00338 // get of a character) will work. 00339 // 00340 // So we do here the slipping in of the leading space if required. 00341 // 00342 if ((fType == Type_PE) && (fRefFrom == RefFrom_NonLiteral)) 00343 { 00344 // This represents no data from the source 00345 fCharSizeBuf[fCharsAvail] = 0; 00346 fCharOfsBuf[fCharsAvail] = 0; 00347 fCharBuf[fCharsAvail++] = chSpace; 00348 } 00349 } 00350 00351 00352 XMLReader::XMLReader(const XMLCh* const pubId 00353 , const XMLCh* const sysId 00354 , BinInputStream* const streamToAdopt 00355 , XMLRecognizer::Encodings encodingEnum 00356 , const RefFrom from 00357 , const Types type 00358 , const Sources source 00359 , const bool throwAtEnd 00360 , const bool calculateSrcOfs 00361 , XMLSize_t lowWaterMark 00362 , const XMLVersion version 00363 , MemoryManager* const manager) : 00364 fCharIndex(0) 00365 , fCharsAvail(0) 00366 , fCurCol(1) 00367 , fCurLine(1) 00368 , fEncoding(XMLRecognizer::UTF_8) 00369 , fEncodingStr(0) 00370 , fForcedEncoding(true) 00371 , fNoMore(false) 00372 , fPublicId(XMLString::replicate(pubId, manager)) 00373 , fRawBufIndex(0) 00374 , fRawBytesAvail(0) 00375 , fLowWaterMark (lowWaterMark) 00376 , fReaderNum(0xFFFFFFFF) 00377 , fRefFrom(from) 00378 , fSentTrailingSpace(false) 00379 , fSource(source) 00380 , fSrcOfsBase(0) 00381 , fSrcOfsSupported(false) 00382 , fCalculateSrcOfs(calculateSrcOfs) 00383 , fSystemId(XMLString::replicate(sysId, manager)) 00384 , fStream(streamToAdopt) 00385 , fSwapped(false) 00386 , fThrowAtEnd(throwAtEnd) 00387 , fTranscoder(0) 00388 , fType(type) 00389 , fMemoryManager(manager) 00390 { 00391 setXMLVersion(version); 00392 00393 // Do an initial load of raw bytes 00394 refreshRawBuffer(); 00395 00396 // Ask the transcoding service if it supports src offset info 00397 fSrcOfsSupported = XMLPlatformUtils::fgTransService->supportsSrcOfs(); 00398 00399 // 00400 // Use the passed encoding code 00401 // 00402 fEncoding = encodingEnum; 00403 fEncodingStr = XMLString::replicate(XMLRecognizer::nameForEncoding(fEncoding, fMemoryManager), fMemoryManager); 00404 00405 // Check whether the fSwapped flag should be set or not 00406 checkForSwapped(); 00407 00408 // 00409 // Create a transcoder for the encoding. Since the encoding has been 00410 // forced, this will be the one we will use, period. 00411 // 00412 XMLTransService::Codes failReason; 00413 fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor 00414 ( 00415 fEncoding 00416 , failReason 00417 , kCharBufSize 00418 , fMemoryManager 00419 ); 00420 00421 if (!fTranscoder) 00422 { 00423 // We are about to throw which means the d-tor won't be called. 00424 // Clean up some memory. 00425 // 00426 fMemoryManager->deallocate(fPublicId); 00427 fMemoryManager->deallocate(fSystemId); 00428 ArrayJanitor<XMLCh> jan (fEncodingStr, fMemoryManager); 00429 00430 ThrowXMLwithMemMgr1 00431 ( 00432 TranscodingException 00433 , XMLExcepts::Trans_CantCreateCvtrFor 00434 , fEncodingStr 00435 , fMemoryManager 00436 ); 00437 } 00438 00439 // 00440 // Note that, unlike above, we do not do an initial decode of the 00441 // first line. We take the caller's word that the encoding is correct 00442 // and just assume that the first bulk decode (kicked off by the first 00443 // get of a character) will work. 00444 // 00445 // So we do here the slipping in of the leading space if required. 00446 // 00447 if ((fType == Type_PE) && (fRefFrom == RefFrom_NonLiteral)) 00448 { 00449 // This represents no data from the source 00450 fCharSizeBuf[fCharsAvail] = 0; 00451 fCharOfsBuf[fCharsAvail] = 0; 00452 fCharBuf[fCharsAvail++] = chSpace; 00453 } 00454 } 00455 00456 00457 XMLReader::~XMLReader() 00458 { 00459 fMemoryManager->deallocate(fEncodingStr); 00460 fMemoryManager->deallocate(fPublicId); 00461 fMemoryManager->deallocate(fSystemId); 00462 delete fStream; 00463 delete fTranscoder; 00464 } 00465 00466 00467 // --------------------------------------------------------------------------- 00468 // XMLReader: Character buffer management methods 00469 // --------------------------------------------------------------------------- 00470 XMLFilePos XMLReader::getSrcOffset() const 00471 { 00472 if (!fSrcOfsSupported || !fCalculateSrcOfs) 00473 ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Reader_SrcOfsNotSupported, fMemoryManager); 00474 00475 // 00476 // Take the current source offset and add in the sizes that we've 00477 // eaten from the source so far. 00478 // 00479 if( fCharIndex == 0 ) { 00480 return fSrcOfsBase; 00481 } 00482 00483 if( fCharIndex < fCharsAvail ) { 00484 00485 return (fSrcOfsBase + fCharOfsBuf[fCharIndex]); 00486 } 00487 00488 return (fSrcOfsBase + fCharOfsBuf[fCharIndex-1] + fCharSizeBuf[fCharIndex-1]); 00489 } 00490 00491 00492 bool XMLReader::refreshCharBuffer() 00493 { 00494 // If the no more flag is set, then don't bother doing anything. 00495 if (fNoMore) 00496 return false; 00497 00498 XMLSize_t startInd; 00499 00500 // See if we have any existing chars. 00501 const XMLSize_t spareChars = fCharsAvail - fCharIndex; 00502 00503 // If we are full, then don't do anything. 00504 if (spareChars == kCharBufSize) 00505 return true; 00506 00507 // 00508 // If no transcoder has been created yet, then we never saw the 00509 // any encoding="" string and the encoding was not forced, so lets 00510 // create one now. We know that it won't change now. 00511 // 00512 // However, note that if we autosensed EBCDIC, then we have to 00513 // consider it an error if we never got an encoding since we don't 00514 // know what variant of EBCDIC it is. 00515 // 00516 if (!fTranscoder) 00517 { 00518 if (fEncoding == XMLRecognizer::EBCDIC) 00519 ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Reader_EncodingStrRequired, fMemoryManager); 00520 00521 // Ask the transcoding service to make use a transcoder 00522 XMLTransService::Codes failReason; 00523 fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor 00524 ( 00525 fEncodingStr 00526 , failReason 00527 , kCharBufSize 00528 , fMemoryManager 00529 ); 00530 00531 if (!fTranscoder) 00532 { 00533 ThrowXMLwithMemMgr1 00534 ( 00535 TranscodingException 00536 , XMLExcepts::Trans_CantCreateCvtrFor 00537 , fEncodingStr 00538 , fMemoryManager 00539 ); 00540 } 00541 } 00542 00543 // 00544 // Add the number of source bytes eaten so far to the base src 00545 // offset member. 00546 // 00547 if (fCalculateSrcOfs) { 00548 for (startInd = 0; startInd < fCharIndex; startInd++) 00549 fSrcOfsBase += fCharSizeBuf[startInd]; 00550 } 00551 00552 // 00553 // If there are spare chars, then move then down to the bottom. We 00554 // have to move the char sizes down also. 00555 // 00556 startInd = 0; 00557 if (spareChars) 00558 { 00559 for (XMLSize_t index = fCharIndex; index < fCharsAvail; index++) 00560 { 00561 fCharBuf[startInd] = fCharBuf[index]; 00562 fCharSizeBuf[startInd] = fCharSizeBuf[index]; 00563 startInd++; 00564 } 00565 } 00566 00567 // 00568 // And then get more chars, starting after any spare chars that were 00569 // left over from the last time. 00570 // 00571 fCharsAvail = xcodeMoreChars 00572 ( 00573 &fCharBuf[startInd] 00574 , &fCharSizeBuf[startInd] 00575 , kCharBufSize - spareChars 00576 ); 00577 00578 // Add back in the spare chars 00579 fCharsAvail += spareChars; 00580 00581 // Reset the buffer index to zero, so we start from the 0th char again 00582 fCharIndex = 0; 00583 00584 // 00585 // If no chars available, then we have to check for one last thing. If 00586 // this is reader for a PE and its not being expanded inside a literal, 00587 // then unget a trailing space. We use a boolean to avoid triggering 00588 // this more than once. 00589 // 00590 if (!fCharsAvail 00591 && (fType == Type_PE) 00592 && (fRefFrom == RefFrom_NonLiteral) 00593 && !fSentTrailingSpace) 00594 { 00595 fCharBuf[0] = chSpace; 00596 fCharsAvail = 1; 00597 fSentTrailingSpace = true; 00598 } 00599 00600 // 00601 // If we get here with no more chars, then set the fNoMore flag which 00602 // lets us optimize and know without checking that no more chars are 00603 // available. 00604 // 00605 if (!fCharsAvail) 00606 fNoMore = true; 00607 00608 // Calculate fCharOfsBuf using the elements from fCharBufSize 00609 if (fCalculateSrcOfs) 00610 { 00611 unsigned int last = 0; 00612 fCharOfsBuf[0] = 0; 00613 for (XMLSize_t index = 1; index < fCharsAvail; ++index) { 00614 fCharOfsBuf[index] = last+fCharSizeBuf[index-1]; 00615 last = fCharOfsBuf[index]; 00616 // code was: 00617 // fCharOfsBuf[index] = fCharOfsBuf[index-1]+fCharSizeBuf[index-1]; 00618 // but on Solaris 64 bit with sun studio 11 this didn't work as 00619 // every value of fCharOfsBuf[] was 1. 00620 } 00621 } 00622 00623 return (fCharsAvail != 0); 00624 } 00625 00626 00627 00628 // --------------------------------------------------------------------------- 00629 // XMLReader: Scanning methods 00630 // --------------------------------------------------------------------------- 00631 bool XMLReader::getName(XMLBuffer& toFill, const bool token) 00632 { 00633 // Ok, first lets see if we have chars in the buffer. If not, then lets 00634 // reload. 00635 if (fCharIndex == fCharsAvail) 00636 { 00637 if (!refreshCharBuffer()) 00638 return false; 00639 } 00640 00641 XMLSize_t charIndex_start = fCharIndex; 00642 00643 // Lets check the first char for being a first name char. If not, then 00644 // what's the point in living mannnn? Just give up now. We only do this 00645 // if its a name and not a name token that they want. 00646 if (!token) 00647 { 00648 if (fXMLVersion == XMLV1_1 && ((fCharBuf[fCharIndex] >= 0xD800) && (fCharBuf[fCharIndex] <= 0xDB7F))) { 00649 // make sure one more char is in the buffer, the transcoder 00650 // should put only a complete surrogate pair into the buffer 00651 assert(fCharIndex+1 < fCharsAvail); 00652 if ((fCharBuf[fCharIndex+1] < 0xDC00) || (fCharBuf[fCharIndex+1] > 0xDFFF)) 00653 return false; 00654 00655 // Looks ok, so lets eat it 00656 fCharIndex += 2; 00657 } 00658 else { 00659 if (!isFirstNameChar(fCharBuf[fCharIndex])) 00660 return false; 00661 00662 // Looks ok, so lets eat it 00663 fCharIndex ++; 00664 } 00665 00666 } 00667 00668 // And now we loop until we run out of data in this reader or we hit 00669 // a non-name char. 00670 while (true) 00671 { 00672 if (fXMLVersion == XMLV1_1) 00673 { 00674 while (fCharIndex < fCharsAvail) 00675 { 00676 // Check the current char and take it if its a name char. Else 00677 // break out. 00678 if ( (fCharBuf[fCharIndex] >= 0xD800) && (fCharBuf[fCharIndex] <= 0xDB7F) ) 00679 { 00680 // make sure one more char is in the buffer, the transcoder 00681 // should put only a complete surrogate pair into the buffer 00682 assert(fCharIndex+1 < fCharsAvail); 00683 if ( (fCharBuf[fCharIndex+1] < 0xDC00) || 00684 (fCharBuf[fCharIndex+1] > 0xDFFF) ) 00685 break; 00686 fCharIndex += 2; 00687 00688 } 00689 else 00690 { 00691 if (!isNameChar(fCharBuf[fCharIndex])) 00692 break; 00693 fCharIndex++; 00694 } 00695 } 00696 } 00697 else // XMLV1_0 00698 { 00699 while (fCharIndex < fCharsAvail) 00700 { 00701 if (!isNameChar(fCharBuf[fCharIndex])) 00702 break; 00703 fCharIndex++; 00704 } 00705 } 00706 00707 // we have to copy the accepted character(s), and update column 00708 if (fCharIndex != charIndex_start) 00709 { 00710 fCurCol += (XMLFileLoc)(fCharIndex - charIndex_start); 00711 toFill.append(&fCharBuf[charIndex_start], fCharIndex - charIndex_start); 00712 } 00713 00714 // something is wrong if there is still something in the buffer 00715 // or if we don't get no more, then break out. 00716 if ((fCharIndex < fCharsAvail) || 00717 !refreshCharBuffer()) 00718 break; 00719 00720 charIndex_start = fCharIndex; 00721 } 00722 00723 return !toFill.isEmpty(); 00724 } 00725 00726 bool XMLReader::getNCName(XMLBuffer& toFill) 00727 { 00728 if (fCharIndex == fCharsAvail && !refreshCharBuffer()) 00729 return false; 00730 00731 XMLSize_t charIndex_start = fCharIndex, count; 00732 // Lets check the first char for being a first name char. If not, then 00733 // what's the point in living mannnn? Just give up now. We only do this 00734 // if its a name and not a name token that they want. 00735 if (fXMLVersion == XMLV1_1 00736 && ((fCharBuf[fCharIndex] >= 0xD800) && (fCharBuf[fCharIndex] <= 0xDB7F))) { 00737 // make sure one more char is in the buffer, the transcoder 00738 // should put only a complete surrogate pair into the buffer 00739 assert(fCharIndex+1 < fCharsAvail); 00740 if ((fCharBuf[fCharIndex+1] < 0xDC00) || (fCharBuf[fCharIndex+1] > 0xDFFF)) 00741 return false; 00742 00743 // Looks ok, so lets eat it 00744 fCharIndex += 2; 00745 } 00746 else { 00747 if (!isFirstNCNameChar(fCharBuf[fCharIndex])) { 00748 return false; 00749 } 00750 00751 // Looks ok, so lets eat it 00752 fCharIndex++; 00753 } 00754 00755 do 00756 { 00757 if (fCharIndex == fCharsAvail) 00758 { 00759 // we have to copy the accepted character(s), and update the column number, 00760 // before getting new data and losing the value of fCharIndex 00761 if((count = fCharIndex - charIndex_start)!=0) 00762 { 00763 fCurCol += (XMLFileLoc)count; 00764 toFill.append(&fCharBuf[charIndex_start], count); 00765 } 00766 if(!refreshCharBuffer()) 00767 return true; 00768 charIndex_start = fCharIndex; 00769 } 00770 00771 // Check the current char and take it if it's a name char 00772 if (fXMLVersion == XMLV1_1) 00773 { 00774 while(fCharIndex < fCharsAvail) 00775 { 00776 if(isNCNameChar(fCharBuf[fCharIndex])) fCharIndex++; 00777 else if((fCharBuf[fCharIndex] >= 0xD800) && (fCharBuf[fCharIndex] <= 0xDB7F) && ((fCharBuf[fCharIndex+1] < 0xDC00) || (fCharBuf[fCharIndex+1] > 0xDFFF))) fCharIndex+=2; 00778 else break; 00779 } 00780 } 00781 else 00782 while(fCharIndex < fCharsAvail && isNCNameChar(fCharBuf[fCharIndex])) fCharIndex++; 00783 // if we didn't consume the entire buffer, we are done 00784 } while(fCharIndex == fCharsAvail); 00785 00786 // we have to copy the accepted character(s), and update column 00787 if((count = fCharIndex - charIndex_start)!=0) 00788 { 00789 fCurCol += (XMLFileLoc)count; 00790 toFill.append(&fCharBuf[charIndex_start], count); 00791 } 00792 return true; 00793 } 00794 00795 bool XMLReader::getQName(XMLBuffer& toFill, int* colonPosition) 00796 { 00797 // We are only looking for two iterations (i.e. 'NCNAME':'NCNAME'). 00798 // We will stop when we finished scanning for a QName (i.e. either a second 00799 // colon or an invalid char). 00800 if(!getNCName(toFill)) 00801 { 00802 *colonPosition = -1; 00803 return false; 00804 } 00805 if (fCharIndex == fCharsAvail && !refreshCharBuffer()) 00806 { 00807 *colonPosition = -1; 00808 return true; 00809 } 00810 if (fCharBuf[fCharIndex] != chColon) 00811 { 00812 *colonPosition = -1; 00813 return true; 00814 } 00815 00816 *colonPosition = (int)toFill.getLen(); 00817 toFill.append(chColon); 00818 fCharIndex++; 00819 fCurCol++; 00820 return getNCName(toFill); 00821 } 00822 00823 bool XMLReader::getSpaces(XMLBuffer& toFill) 00824 { 00825 // 00826 // We just loop until we either hit a non-space or the end of this 00827 // entity. We return true if we returned because of a non-space and 00828 // false if because of end of entity. 00829 // 00830 // NOTE: We have to maintain line/col info here and we have to do 00831 // whitespace normalization if we are not already internalized. 00832 // 00833 while (true) 00834 { 00835 // Loop through the current chars in the buffer 00836 while (fCharIndex < fCharsAvail) 00837 { 00838 // Get the current char out of the buffer 00839 XMLCh curCh = fCharBuf[fCharIndex]; 00840 00841 // 00842 // See if its a white space char. If so, then process it. Else 00843 // we've hit a non-space and need to return. 00844 // 00845 if (isWhitespace(curCh)) 00846 { 00847 // Eat this char 00848 fCharIndex++; 00849 00850 // 00851 // 'curCh' is a whitespace(x20|x9|xD|xA), so we only can have 00852 // end-of-line combinations with a leading chCR(xD) or chLF(xA) 00853 // 00854 // 100000 x20 00855 // 001001 x9 00856 // 001010 chLF 00857 // 001101 chCR 00858 // ----------- 00859 // 000110 == (chCR|chLF) & ~(0x9|0x20) 00860 // 00861 // if the result of thelogical-& operation is 00862 // true : 'curCh' must be xA or xD 00863 // false : 'curCh' must be x20 or x9 00864 // 00865 if ( ( curCh & (chCR|chLF) & ~(0x9|0x20) ) == 0 ) 00866 { 00867 fCurCol++; 00868 } else 00869 { 00870 handleEOL(curCh, false); 00871 } 00872 00873 // Ok we can add this guy to our buffer 00874 toFill.append(curCh); 00875 } 00876 else 00877 { 00878 // Return true to indicate we broke out due to a whitespace 00879 return true; 00880 } 00881 } 00882 00883 // 00884 // We've eaten up the current buffer, so lets try to reload it. If 00885 // we don't get anything new, then break out. If we do, then we go 00886 // back to the top to keep getting spaces. 00887 // 00888 if (!refreshCharBuffer()) 00889 break; 00890 } 00891 return false; 00892 } 00893 00894 00895 bool XMLReader::getUpToCharOrWS(XMLBuffer& toFill, const XMLCh toCheck) 00896 { 00897 while (true) 00898 { 00899 // Loop through the current chars in the buffer 00900 while (fCharIndex < fCharsAvail) 00901 { 00902 // Get the current char out of the buffer 00903 XMLCh curCh = fCharBuf[fCharIndex]; 00904 00905 // 00906 // See if its not a white space or our target char, then process 00907 // it. Else, we need to return. 00908 // 00909 if (!isWhitespace(curCh) && (curCh != toCheck)) 00910 { 00911 // Eat this char 00912 fCharIndex++; 00913 00914 // 00915 // 'curCh' is not a whitespace(x20|x9|xD|xA), so we only can 00916 // have end-of-line combinations with a leading chNEL(x85) or 00917 // chLineSeparator(x2028) 00918 // 00919 // 0010000000101000 chLineSeparator 00920 // 0000000010000101 chNEL 00921 // --------------------- 00922 // 1101111101010010 == ~(chNEL|chLineSeparator) 00923 // 00924 // if the result of the logical-& operation is 00925 // true : 'curCh' can not be chNEL or chLineSeparator 00926 // false : 'curCh' can be chNEL or chLineSeparator 00927 // 00928 if ( curCh & (XMLCh) ~(chNEL|chLineSeparator) ) 00929 { 00930 fCurCol++; 00931 } else 00932 { 00933 handleEOL(curCh, false); 00934 } 00935 00936 // Add it to our buffer 00937 toFill.append(curCh); 00938 } 00939 else 00940 { 00941 return true; 00942 } 00943 } 00944 00945 // 00946 // We've eaten up the current buffer, so lets try to reload it. If 00947 // we don't get anything new, then break out. If we do, then we go 00948 // back to the top to keep getting spaces. 00949 // 00950 if (!refreshCharBuffer()) 00951 break; 00952 } 00953 00954 // We never hit any non-space and ate up the whole reader 00955 return false; 00956 00957 } 00958 00959 bool XMLReader::skipIfQuote(XMLCh& chGotten) 00960 { 00961 if (fCharIndex == fCharsAvail && !refreshCharBuffer()) 00962 return false; 00963 00964 chGotten = fCharBuf[fCharIndex]; 00965 if ((chGotten == chDoubleQuote) || (chGotten == chSingleQuote)) 00966 { 00967 fCharIndex++; 00968 fCurCol++; 00969 return true; 00970 } 00971 return false; 00972 } 00973 00974 00975 bool XMLReader::skipSpaces(bool& skippedSomething, bool inDecl) 00976 { 00977 // DO NOT set the skippedSomething to 'false', but change it to be 'true' only 00978 00979 // We enter a loop where we skip over spaces until we hit the end of 00980 // this reader or a non-space value. The return indicates whether we 00981 // hit the non-space (true) or the end (false). 00982 do 00983 { 00984 // Loop through the current chars in the buffer 00985 while (fCharIndex < fCharsAvail) 00986 { 00987 // See if its a white space char. If so, then process it. Else 00988 // we've hit a non-space and need to return. 00989 if (isWhitespace(fCharBuf[fCharIndex])) 00990 { 00991 // Get the current char out of the buffer and eat it 00992 XMLCh curCh = fCharBuf[fCharIndex++]; 00993 skippedSomething = true; 00994 // 00995 // 'curCh' is a whitespace(x20|x9|xD|xA), so we only can have 00996 // end-of-line combinations with a leading chCR(xD) or chLF(xA) 00997 // 00998 // 100000 x20 00999 // 001001 x9 01000 // 001010 chLF 01001 // 001101 chCR 01002 // ----------- 01003 // 000110 == (chCR|chLF) & ~(0x9|0x20) 01004 // 01005 // if the result of the logical-& operation is 01006 // true : 'curCh' must be xA or xD 01007 // false : 'curCh' must be x20 or x9 01008 // 01009 if ( ( curCh & (chCR|chLF) & ~(0x9|0x20) ) == 0 ) 01010 { 01011 fCurCol++; 01012 } else 01013 { 01014 handleEOL(curCh, inDecl); 01015 } 01016 } 01017 else 01018 return true; 01019 } 01020 01021 // We've eaten up the current buffer, so lets try to reload it. If 01022 // we don't get anything new, then break out. If we do, then we go 01023 // back to the top to keep getting spaces. 01024 } while(refreshCharBuffer()); 01025 01026 // We never hit any non-space and ate up the whole reader 01027 return false; 01028 } 01029 01030 bool XMLReader::skippedChar(const XMLCh toSkip) 01031 { 01032 // 01033 // If the buffer is empty, then try to reload it. If we still get 01034 // nothing, then return false. 01035 // 01036 if (fCharIndex == fCharsAvail) 01037 { 01038 if (!refreshCharBuffer()) 01039 return false; 01040 } 01041 01042 // 01043 // See if the current char is the one we want. If so, then we need 01044 // to eat it and return true. 01045 // 01046 if (fCharBuf[fCharIndex] == toSkip) 01047 { 01048 fCharIndex++; 01049 fCurCol++; 01050 return true; 01051 } 01052 return false; 01053 } 01054 01055 01056 bool XMLReader::skippedSpace() 01057 { 01058 // 01059 // If the buffer is empty, then try to reload it. If we still get 01060 // nothing, then return false. 01061 // 01062 if (fCharIndex == fCharsAvail) 01063 { 01064 if (!refreshCharBuffer()) 01065 return false; 01066 } 01067 01068 // 01069 // See if the current char is a whitespace. If so, then we need to eat 01070 // it and return true. 01071 // 01072 const XMLCh curCh = fCharBuf[fCharIndex]; 01073 if (isWhitespace(curCh)) 01074 { 01075 // Eat the character 01076 fCharIndex++; 01077 01078 // 01079 // 'curCh' is a whitespace(x20|x9|xD|xA), so we only can have 01080 // end-of-line combinations with a leading chCR(xD) or chLF(xA) 01081 // 01082 // 100000 x20 01083 // 001001 x9 01084 // 001010 chLF 01085 // 001101 chCR 01086 // ----------- 01087 // 000110 == (chCR|chLF) & ~(0x9|0x20) 01088 // 01089 // if the result of the logical-& operation is 01090 // true : 'curCh' must be xA or xD 01091 // false : 'curCh' must be x20 or x9 01092 // 01093 if ( ( curCh & (chCR|chLF) & ~(0x9|0x20) ) == 0 ) 01094 { 01095 fCurCol++; 01096 } else 01097 { 01098 handleEOL((XMLCh&)curCh, false); 01099 } 01100 01101 return true; 01102 } 01103 return false; 01104 } 01105 01106 bool XMLReader::skippedString(const XMLCh* const toSkip) 01107 { 01108 // This function works on strings that are smaller than kCharBufSize. 01109 // This function guarantees that in case the comparison is unsuccessful 01110 // the fCharIndex will point to the original data. 01111 // 01112 01113 // Get the length of the string to skip. 01114 // 01115 const XMLSize_t srcLen = XMLString::stringLen(toSkip); 01116 XMLSize_t charsLeft = charsLeftInBuffer(); 01117 01118 // See if the current reader has enough chars to test against this 01119 // string. If not, then ask it to reload its buffer. If that does not 01120 // get us enough, then it cannot match. 01121 // 01122 // NOTE: This works because strings never have to cross a reader! And 01123 // a string to skip will never have a new line in it, so we will never 01124 // miss adjusting the current line. 01125 // 01126 while (charsLeft < srcLen) 01127 { 01128 if (!refreshCharBuffer()) 01129 return false; 01130 01131 XMLSize_t tmp = charsLeftInBuffer(); 01132 if (tmp == charsLeft) // if the refreshCharBuf() did not add anything new 01133 return false; // give up and return. 01134 01135 charsLeft = tmp; 01136 } 01137 01138 // Ok, now we now that the current reader has enough chars in its 01139 // buffer and that its index is back at zero. So we can do a quick and 01140 // dirty comparison straight to its buffer with no requirement to unget 01141 // if it fails. 01142 // 01143 if (memcmp(&fCharBuf[fCharIndex], toSkip, srcLen * sizeof(XMLCh))) 01144 return false; 01145 01146 // Add the source length to the current column to get it back right. 01147 // 01148 fCurCol += (XMLFileLoc)srcLen; 01149 01150 // And get the character buffer index back right by just adding the 01151 // source len to it. 01152 // 01153 fCharIndex += srcLen; 01154 01155 return true; 01156 } 01157 01158 bool XMLReader::skippedStringLong(const XMLCh* toSkip) 01159 { 01160 // This function works on strings that are potentially longer than 01161 // kCharBufSize (e.g., end tag). This function does not guarantee 01162 // that in case the comparison is unsuccessful the fCharIndex will 01163 // point to the original data. 01164 // 01165 01166 XMLSize_t srcLen = XMLString::stringLen(toSkip); 01167 XMLSize_t charsLeft = charsLeftInBuffer(); 01168 01169 while (srcLen != 0) 01170 { 01171 // Fill up the buffer with as much data as possible. 01172 // 01173 while (charsLeft < srcLen && charsLeft != kCharBufSize) 01174 { 01175 if (!refreshCharBuffer()) 01176 return false; 01177 01178 XMLSize_t tmp = charsLeftInBuffer(); 01179 if (tmp == charsLeft) // if the refreshCharBuf() did not add anything 01180 return false; // new give up and return. 01181 01182 charsLeft = tmp; 01183 } 01184 01185 XMLSize_t n = charsLeft < srcLen ? charsLeft : srcLen; 01186 01187 if (memcmp(&fCharBuf[fCharIndex], toSkip, n * sizeof(XMLCh))) 01188 return false; 01189 01190 toSkip += n; 01191 srcLen -= n; 01192 01193 fCharIndex += n; 01194 fCurCol += (XMLFileLoc)n; 01195 charsLeft -= n; 01196 } 01197 01198 return true; 01199 } 01200 01201 // 01202 // This is just to peek if the next coming buffer 01203 // matches the string toPeek. 01204 // Similar to skippedString, but just the fCharIndex and fCurCol are not updated 01205 // 01206 bool XMLReader::peekString(const XMLCh* const toPeek) 01207 { 01208 // Get the length of the string to skip 01209 const XMLSize_t srcLen = XMLString::stringLen(toPeek); 01210 01211 // 01212 // See if the current reader has enough chars to test against this 01213 // string. If not, then ask it to reload its buffer. If that does not 01214 // get us enough, then it cannot match. 01215 // 01216 // NOTE: This works because strings never have to cross a reader! And 01217 // a string to skip will never have a new line in it, so we will never 01218 // miss adjusting the current line. 01219 // 01220 XMLSize_t charsLeft = charsLeftInBuffer(); 01221 while (charsLeft < srcLen) 01222 { 01223 refreshCharBuffer(); 01224 XMLSize_t t = charsLeftInBuffer(); 01225 if (t == charsLeft) // if the refreshCharBuf() did not add anything new 01226 return false; // give up and return. 01227 charsLeft = t; 01228 } 01229 01230 01231 01232 01233 // 01234 // Ok, now we now that the current reader has enough chars in its 01235 // buffer and that its index is back at zero. So we can do a quick and 01236 // dirty comparison straight to its buffer with no requirement to unget 01237 // if it fails. 01238 // 01239 if (memcmp(&fCharBuf[fCharIndex], toPeek, srcLen*sizeof(XMLCh))) 01240 return false; 01241 01242 return true; 01243 } 01244 01245 01246 // --------------------------------------------------------------------------- 01247 // XMLReader: Setter methods (most are inlined) 01248 // --------------------------------------------------------------------------- 01249 bool XMLReader::setEncoding(const XMLCh* const newEncoding) 01250 { 01251 // 01252 // If the encoding was forced, then we ignore the new value and just 01253 // return with success. If it was forced, then we are to use that 01254 // encoding without question. Note that, if we are forced, we created 01255 // a transcoder up front so there is no need to do one here in that 01256 // case. 01257 // 01258 if (fForcedEncoding) 01259 return true; 01260 01261 // 01262 // upperCase the newEncoding first for better performance 01263 // 01264 XMLCh* inputEncoding = XMLString::replicate(newEncoding, fMemoryManager); 01265 XMLString::upperCaseASCII(inputEncoding); 01266 01267 XMLRecognizer::Encodings newBaseEncoding; 01268 // 01269 // Check for non-endian specific UTF-16 or UCS-4. If so, and if we 01270 // are already in one of the endian versions of those encodings, 01271 // then just keep it and go on. Otherwise, its not valid. 01272 // 01273 if (XMLString::equals(inputEncoding, XMLUni::fgUTF16EncodingString) 01274 || XMLString::equals(inputEncoding, XMLUni::fgUTF16EncodingString2) 01275 || XMLString::equals(inputEncoding, XMLUni::fgUTF16EncodingString3) 01276 || XMLString::equals(inputEncoding, XMLUni::fgUTF16EncodingString4) 01277 || XMLString::equals(inputEncoding, XMLUni::fgUTF16EncodingString5) 01278 || XMLString::equals(inputEncoding, XMLUni::fgUTF16EncodingString6) 01279 || XMLString::equals(inputEncoding, XMLUni::fgUTF16EncodingString7)) 01280 { 01281 fMemoryManager->deallocate(inputEncoding); 01282 01283 if ((fEncoding != XMLRecognizer::UTF_16L) 01284 && (fEncoding != XMLRecognizer::UTF_16B)) 01285 { 01286 return false; 01287 } 01288 01289 // Override with the original endian specific encoding 01290 newBaseEncoding = fEncoding; 01291 01292 if (fEncoding == XMLRecognizer::UTF_16L) { 01293 fMemoryManager->deallocate(fEncodingStr); 01294 fEncodingStr = 0; 01295 fEncodingStr = XMLString::replicate(XMLUni::fgUTF16LEncodingString, fMemoryManager); 01296 } 01297 else { 01298 fMemoryManager->deallocate(fEncodingStr); 01299 fEncodingStr = 0; 01300 fEncodingStr = XMLString::replicate(XMLUni::fgUTF16BEncodingString, fMemoryManager); 01301 } 01302 } 01303 else if (XMLString::equals(inputEncoding, XMLUni::fgUCS4EncodingString) 01304 || XMLString::equals(inputEncoding, XMLUni::fgUCS4EncodingString2) 01305 || XMLString::equals(inputEncoding, XMLUni::fgUCS4EncodingString3) 01306 || XMLString::equals(inputEncoding, XMLUni::fgUCS4EncodingString4) 01307 || XMLString::equals(inputEncoding, XMLUni::fgUCS4EncodingString5)) 01308 { 01309 fMemoryManager->deallocate(inputEncoding); 01310 01311 if ((fEncoding != XMLRecognizer::UCS_4L) 01312 && (fEncoding != XMLRecognizer::UCS_4B)) 01313 { 01314 return false; 01315 } 01316 01317 // Override with the original endian specific encoding 01318 newBaseEncoding = fEncoding; 01319 01320 if (fEncoding == XMLRecognizer::UCS_4L) { 01321 01322 fMemoryManager->deallocate(fEncodingStr); 01323 fEncodingStr = 0; 01324 fEncodingStr = XMLString::replicate(XMLUni::fgUCS4LEncodingString, fMemoryManager); 01325 } 01326 else { 01327 01328 fMemoryManager->deallocate(fEncodingStr); 01329 fEncodingStr = 0; 01330 fEncodingStr = XMLString::replicate(XMLUni::fgUCS4BEncodingString, fMemoryManager); 01331 } 01332 } 01333 else 01334 { 01335 // 01336 // Try to map the string to one of our standard encodings. If its not 01337 // one of them, then it has to be one of the non-intrinsic encodings, 01338 // in which case we have to delete our intrinsic encoder and create a 01339 // new one. 01340 // 01341 newBaseEncoding = XMLRecognizer::encodingForName(inputEncoding); 01342 01343 // 01344 // If it does not come back as one of the auto-sensed encodings, then we 01345 // have to possibly replace it and at least check a few things. 01346 // 01347 if (newBaseEncoding == XMLRecognizer::OtherEncoding) 01348 { 01349 // 01350 // We already know it's none of those non-endian special cases, 01351 // so just replicate the new name and use it directly to create the transcoder 01352 // 01353 fMemoryManager->deallocate(fEncodingStr); 01354 fEncodingStr = inputEncoding; 01355 01356 XMLTransService::Codes failReason; 01357 fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor 01358 ( 01359 fEncodingStr 01360 , failReason 01361 , kCharBufSize 01362 , fMemoryManager 01363 ); 01364 } 01365 else 01366 { 01367 // Store the new encoding string since it is just an intrinsic 01368 fMemoryManager->deallocate(fEncodingStr); 01369 fEncodingStr = inputEncoding; 01370 } 01371 } 01372 01373 if (!fTranscoder) { 01374 // 01375 // Now we can create a transcoder using the recognized fEncoding. We 01376 // might get back a transcoder for an intrinsically supported encoding, 01377 // or we might get one from the underlying transcoding service. 01378 // 01379 XMLTransService::Codes failReason; 01380 fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor 01381 ( 01382 newBaseEncoding 01383 , failReason 01384 , kCharBufSize 01385 , fMemoryManager 01386 ); 01387 01388 if (!fTranscoder) 01389 ThrowXMLwithMemMgr1(TranscodingException, XMLExcepts::Trans_CantCreateCvtrFor, fEncodingStr, fMemoryManager); 01390 } 01391 01392 // Update the base encoding member with the new base encoding found 01393 fEncoding = newBaseEncoding; 01394 01395 // Looks ok to us 01396 return true; 01397 } 01398 01399 01400 // --------------------------------------------------------------------------- 01401 // XMLReader: Private helper methods 01402 // --------------------------------------------------------------------------- 01403 01404 // 01405 // This is called when the encoding flag is set and just sets the fSwapped 01406 // flag appropriately. 01407 // 01408 void XMLReader::checkForSwapped() 01409 { 01410 // Assume not swapped 01411 fSwapped = false; 01412 01413 if (XMLPlatformUtils::fgXMLChBigEndian) 01414 { 01415 if ((fEncoding == XMLRecognizer::UTF_16L) 01416 || (fEncoding == XMLRecognizer::UCS_4L)) 01417 { 01418 fSwapped = true; 01419 } 01420 } 01421 else 01422 { 01423 if ((fEncoding == XMLRecognizer::UTF_16B) 01424 || (fEncoding == XMLRecognizer::UCS_4B)) 01425 { 01426 fSwapped = true; 01427 } 01428 } 01429 } 01430 01431 01432 // 01433 // This is called from the constructor when the encoding is not forced. 01434 // We assume that the encoding has been auto-sensed at this point and that 01435 // fSwapped is set correctly. 01436 // 01437 // In the case of UCS-4 and EBCDIC, we don't have to check for a decl. 01438 // The fact that we got here, means that there is one, because that's the 01439 // only way we can autosense those. 01440 // 01441 void XMLReader::doInitDecode() 01442 { 01443 switch(fEncoding) 01444 { 01445 case XMLRecognizer::UCS_4B : 01446 case XMLRecognizer::UCS_4L : 01447 { 01448 // Remove bom if any 01449 if (((fRawByteBuf[0] == 0x00) && (fRawByteBuf[1] == 0x00) && (fRawByteBuf[2] == 0xFE) && (fRawByteBuf[3] == 0xFF)) || 01450 ((fRawByteBuf[0] == 0xFF) && (fRawByteBuf[1] == 0xFE) && (fRawByteBuf[2] == 0x00) && (fRawByteBuf[3] == 0x00)) ) 01451 { 01452 for (XMLSize_t i = 0; i < fRawBytesAvail; i++) 01453 fRawByteBuf[i] = fRawByteBuf[i+4]; 01454 01455 fRawBytesAvail -=4; 01456 } 01457 01458 // Look at the raw buffer as UCS4 chars 01459 const UCS4Ch* asUCS = (const UCS4Ch*)fRawByteBuf; 01460 01461 while (fRawBufIndex < fRawBytesAvail) 01462 { 01463 // Get out the current 4 byte value and inc our raw buf index 01464 UCS4Ch curVal = *asUCS++; 01465 fRawBufIndex += sizeof(UCS4Ch); 01466 01467 // Swap if that is required for this machine 01468 if (fSwapped) 01469 curVal = BitOps::swapBytes(curVal); 01470 01471 // Make sure its at least semi legal. If not, undo and throw 01472 if (curVal > 0xFFFF) 01473 { 01474 fCharsAvail = 0; 01475 fRawBufIndex = 0; 01476 fMemoryManager->deallocate(fPublicId); 01477 fMemoryManager->deallocate(fEncodingStr); 01478 ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager); 01479 ThrowXMLwithMemMgr1 01480 ( 01481 TranscodingException 01482 , XMLExcepts::Reader_CouldNotDecodeFirstLine 01483 , fSystemId 01484 , fMemoryManager 01485 ); 01486 } 01487 01488 // Convert the value to an XML char and store it 01489 fCharSizeBuf[fCharsAvail] = 4; 01490 fCharBuf[fCharsAvail++] = XMLCh(curVal); 01491 01492 // Break out on the > character 01493 if (curVal == chCloseAngle) 01494 break; 01495 } 01496 break; 01497 } 01498 01499 case XMLRecognizer::UTF_8 : 01500 { 01501 // If there's a utf-8 BOM (0xEF 0xBB 0xBF), skip past it. 01502 // Don't move to char buf - no one wants to see it. 01503 // Note: this causes any encoding= declaration to override 01504 // the BOM's attempt to say that the encoding is utf-8. 01505 01506 // Look at the raw buffer as short chars 01507 const char* asChars = (const char*)fRawByteBuf; 01508 01509 if (fRawBytesAvail > XMLRecognizer::fgUTF8BOMLen && 01510 XMLString::compareNString( asChars 01511 , XMLRecognizer::fgUTF8BOM 01512 , XMLRecognizer::fgUTF8BOMLen) == 0) 01513 { 01514 fRawBufIndex += XMLRecognizer::fgUTF8BOMLen; 01515 asChars += XMLRecognizer::fgUTF8BOMLen; 01516 } 01517 01518 // 01519 // First check that there are enough bytes to even see the 01520 // decl indentifier. If not, get out now with no action since 01521 // there is no decl. 01522 // 01523 if (fRawBytesAvail < XMLRecognizer::fgASCIIPreLen) 01524 break; 01525 01526 // Check for the opening sequence. If not, then no decl 01527 if (XMLString::compareNString( asChars 01528 , XMLRecognizer::fgASCIIPre 01529 , XMLRecognizer::fgASCIIPreLen)) 01530 { 01531 break; 01532 } 01533 01534 while (fRawBufIndex < fRawBytesAvail) 01535 { 01536 const char curCh = *asChars++; 01537 fRawBufIndex++; 01538 01539 // Looks ok, so store it 01540 fCharSizeBuf[fCharsAvail] = 1; 01541 fCharBuf[fCharsAvail++] = XMLCh(curCh); 01542 01543 // Break out on a > character 01544 if (curCh == chCloseAngle) 01545 break; 01546 01547 // 01548 // A char greater than 0x7F is not allowed in this case. If 01549 // so, undo and throw. 01550 // 01551 if (curCh & 0x80) 01552 { 01553 fCharsAvail = 0; 01554 fRawBufIndex = 0; 01555 fMemoryManager->deallocate(fPublicId); 01556 fMemoryManager->deallocate(fEncodingStr); 01557 ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager); 01558 ThrowXMLwithMemMgr1 01559 ( 01560 TranscodingException 01561 , XMLExcepts::Reader_CouldNotDecodeFirstLine 01562 , fSystemId 01563 , fMemoryManager 01564 ); 01565 } 01566 } 01567 break; 01568 } 01569 01570 case XMLRecognizer::UTF_16B : 01571 case XMLRecognizer::UTF_16L : 01572 { 01573 // 01574 // If there is a decl here, we just truncate back the characters 01575 // as we go. No surrogate creation would be allowed here in legal 01576 // XML, so we consider it a transoding error if we find one. 01577 // 01578 if (fRawBytesAvail < 2) 01579 break; 01580 01581 XMLSize_t postBOMIndex = 0; 01582 const UTF16Ch* asUTF16 = (const UTF16Ch*)&fRawByteBuf[fRawBufIndex]; 01583 if ((*asUTF16 == chUnicodeMarker) || (*asUTF16 == chSwappedUnicodeMarker)) 01584 { 01585 fRawBufIndex += sizeof(UTF16Ch); 01586 asUTF16++; 01587 postBOMIndex = fRawBufIndex; 01588 } 01589 01590 // First check that there are enough raw bytes for there to even 01591 // be a decl indentifier. If not, then nothing to do. 01592 // 01593 if (fRawBytesAvail - fRawBufIndex < XMLRecognizer::fgUTF16PreLen) 01594 { 01595 fRawBufIndex = postBOMIndex; 01596 break; 01597 } 01598 01599 // 01600 // See we get a match on the prefix. If not, then reset and 01601 // break out. 01602 // 01603 if (fEncoding == XMLRecognizer::UTF_16B) 01604 { 01605 if (memcmp(asUTF16, XMLRecognizer::fgUTF16BPre, XMLRecognizer::fgUTF16PreLen)) 01606 { 01607 fRawBufIndex = postBOMIndex; 01608 break; 01609 } 01610 } 01611 else 01612 { 01613 if (memcmp(asUTF16, XMLRecognizer::fgUTF16LPre, XMLRecognizer::fgUTF16PreLen)) 01614 { 01615 fRawBufIndex = postBOMIndex; 01616 break; 01617 } 01618 } 01619 01620 while (fRawBufIndex < fRawBytesAvail) 01621 { 01622 // Get out the current 2 byte value 01623 UTF16Ch curVal = *asUTF16++; 01624 fRawBufIndex += sizeof(UTF16Ch); 01625 01626 // Swap if that is required for this machine 01627 if (fSwapped) 01628 curVal = BitOps::swapBytes(curVal); 01629 01630 // 01631 // Store it and bump the target index, implicitly converting 01632 // if UTF16Ch and XMLCh are not the same size. 01633 // 01634 fCharSizeBuf[fCharsAvail] = 2; 01635 fCharBuf[fCharsAvail++] = curVal; 01636 01637 // Break out on a > char 01638 if (curVal == chCloseAngle) 01639 break; 01640 } 01641 break; 01642 } 01643 01644 case XMLRecognizer::EBCDIC : 01645 { 01646 // 01647 // We use special support in the intrinsic EBCDIC-US transcoder 01648 // to go through one char at a time. 01649 // 01650 const XMLByte* srcPtr = fRawByteBuf; 01651 while (1) 01652 { 01653 // Transcode one char from the source 01654 const XMLCh chCur = XMLEBCDICTranscoder::xlatThisOne(*srcPtr++); 01655 fRawBufIndex++; 01656 01657 // 01658 // And put it into the character buffer. This stuff has to 01659 // look like it was normally transcoded. 01660 // 01661 fCharSizeBuf[fCharsAvail] = 1; 01662 fCharBuf[fCharsAvail++] = chCur; 01663 01664 // If its a > char, then break out 01665 if (chCur == chCloseAngle) 01666 break; 01667 01668 // Watch for using up all input and get out 01669 if (fRawBufIndex == fRawBytesAvail) 01670 break; 01671 } 01672 break; 01673 } 01674 01675 default : 01676 // It should never be anything else here 01677 fMemoryManager->deallocate(fPublicId); 01678 fMemoryManager->deallocate(fEncodingStr); 01679 fMemoryManager->deallocate(fSystemId); 01680 ThrowXMLwithMemMgr(TranscodingException, XMLExcepts::Reader_BadAutoEncoding, fMemoryManager); 01681 break; 01682 } 01683 01684 // 01685 // Ok, by the time we get here, if its a legal XML file we have eaten 01686 // the XML/TextDecl. So, if we are a PE and are being referenced from 01687 // outside a literal, then we need to throw in an arbitrary space that 01688 // is required by XML. 01689 // 01690 if ((fType == Type_PE) && (fRefFrom == RefFrom_NonLiteral)) 01691 fCharBuf[fCharsAvail++] = chSpace; 01692 01693 // Calculate fCharOfsBuf buffer using the elements from fCharBufSize 01694 if (fCalculateSrcOfs) 01695 { 01696 fCharOfsBuf[0] = 0; 01697 for (XMLSize_t index = 1; index < fCharsAvail; ++index) { 01698 fCharOfsBuf[index] = fCharOfsBuf[index-1]+fCharSizeBuf[index-1]; 01699 } 01700 } 01701 } 01702 01703 01704 // 01705 // This method is called internally when we run out of bytes in the raw 01706 // buffer. We just read as many bytes as we can into the raw buffer again 01707 // and store the number of bytes we got. 01708 // 01709 void XMLReader::refreshRawBuffer() 01710 { 01711 // 01712 // If there are any bytes left, move them down to the start. There 01713 // should only ever be (max bytes per char - 1) at the most. 01714 // 01715 const XMLSize_t bytesLeft = fRawBytesAvail - fRawBufIndex; 01716 01717 // Move the existing ones down 01718 for (XMLSize_t index = 0; index < bytesLeft; index++) 01719 fRawByteBuf[index] = fRawByteBuf[fRawBufIndex + index]; 01720 01721 // 01722 // And then read into the buffer past the existing bytes. Add back in 01723 // that many to the bytes read, and subtract that many from the bytes 01724 // requested. 01725 // 01726 fRawBytesAvail = fStream->readBytes 01727 ( 01728 &fRawByteBuf[bytesLeft], kRawBufSize - bytesLeft 01729 ) + bytesLeft; 01730 01731 // 01732 // We need to reset the buffer index back to the start in all cases, 01733 // since any trailing data was copied down to the start. 01734 // 01735 fRawBufIndex = 0; 01736 } 01737 01738 01739 // 01740 // This method is called internally when we run out of characters in the 01741 // trancoded character buffer. We transcode up to another maxChars chars 01742 // from the 01743 // 01744 XMLSize_t 01745 XMLReader::xcodeMoreChars( XMLCh* const bufToFill 01746 , unsigned char* const charSizes 01747 , const XMLSize_t maxChars) 01748 { 01749 XMLSize_t charsDone = 0; 01750 XMLSize_t bytesEaten = 0; 01751 bool needMode = false; 01752 01753 while (!bytesEaten) 01754 { 01755 // If our raw buffer is low, then lets load up another batch of 01756 // raw bytes now. 01757 // 01758 XMLSize_t bytesLeft = fRawBytesAvail - fRawBufIndex; 01759 if (needMode || bytesLeft == 0 || bytesLeft < fLowWaterMark) 01760 { 01761 refreshRawBuffer(); 01762 01763 // If there are no characters or if we need more but didn't get 01764 // any, return zero now. 01765 // 01766 if (fRawBytesAvail == 0 || 01767 (needMode && (bytesLeft == fRawBytesAvail - fRawBufIndex))) 01768 return 0; 01769 } 01770 01771 // Ask the transcoder to internalize another batch of chars. It is 01772 // possible that there is data in the raw buffer but the transcoder 01773 // is unable to produce anything because transcoding of multi-byte 01774 // encodings may have left a few bytes representing a partial 01775 // character in the buffer that can't be used until the next chunk 01776 // (and the rest of the character) is read. In this case set the 01777 // needMore flag and try again. 01778 // 01779 01780 charsDone = fTranscoder->transcodeFrom 01781 ( 01782 &fRawByteBuf[fRawBufIndex] 01783 , fRawBytesAvail - fRawBufIndex 01784 , bufToFill 01785 , maxChars 01786 , bytesEaten 01787 , charSizes 01788 ); 01789 01790 if (bytesEaten == 0) 01791 needMode = true; 01792 else 01793 fRawBufIndex += bytesEaten; 01794 } 01795 01796 return charsDone; 01797 } 01798 01799 /*** 01800 * 01801 * XML1.1 01802 * 01803 * 2.11 End-of-Line Handling 01804 * 01805 * XML parsed entities are often stored in computer files which, for editing 01806 * convenience, are organized into lines. These lines are typically separated 01807 * by some combination of the characters CARRIAGE RETURN (#xD) and LINE FEED (#xA). 01808 * 01809 * To simplify the tasks of applications, the XML processor MUST behave as if 01810 * it normalized all line breaks in external parsed entities (including the document 01811 * entity) on input, before parsing, by translating all of the following to a single 01812 * #xA character: 01813 * 01814 * 1. the two-character sequence #xD #xA 01815 * 2. the two-character sequence #xD #x85 01816 * 3. the single character #x85 01817 * 4. the single character #x2028 01818 * 5. any #xD character that is not immediately followed by #xA or #x85. 01819 * 01820 * 01821 ***/ 01822 void XMLReader::handleEOL(XMLCh& curCh, bool inDecl) 01823 { 01824 // 1. the two-character sequence #xD #xA 01825 // 2. the two-character sequence #xD #x85 01826 // 5. any #xD character that is not immediately followed by #xA or #x85. 01827 switch(curCh) 01828 { 01829 case chCR: 01830 fCurCol = 1; 01831 fCurLine++; 01832 01833 // 01834 // If not already internalized, then convert it to an 01835 // LF and eat any following LF. 01836 // 01837 if (fSource == Source_External) 01838 { 01839 if ((fCharIndex < fCharsAvail) || refreshCharBuffer()) 01840 { 01841 if ( fCharBuf[fCharIndex] == chLF || 01842 ((fCharBuf[fCharIndex] == chNEL) && fNEL) ) 01843 { 01844 fCharIndex++; 01845 } 01846 } 01847 curCh = chLF; 01848 } 01849 break; 01850 01851 case chLF: 01852 fCurCol = 1; 01853 fCurLine++; 01854 break; 01855 01856 // 3. the single character #x85 01857 // 4. the single character #x2028 01858 case chNEL: 01859 case chLineSeparator: 01860 if (inDecl && fXMLVersion == XMLV1_1) 01861 { 01862 01863 /*** 01864 * XML1.1 01865 * 01866 * 2.11 End-of-Line Handling 01867 * ... 01868 * The characters #x85 and #x2028 cannot be reliably recognized and translated 01869 * until an entity's encoding declaration (if present) has been read. 01870 * Therefore, it is a fatal error to use them within the XML declaration or 01871 * text declaration. 01872 * 01873 ***/ 01874 ThrowXMLwithMemMgr1 01875 ( 01876 TranscodingException 01877 , XMLExcepts::Reader_NelLsepinDecl 01878 , fSystemId 01879 , fMemoryManager 01880 ); 01881 } 01882 01883 if (fNEL && fSource == Source_External) 01884 { 01885 fCurCol = 1; 01886 fCurLine++; 01887 curCh = chLF; 01888 } 01889 break; 01890 default: 01891 fCurCol++; 01892 } 01893 } 01894 01895 XERCES_CPP_NAMESPACE_END