GME
13
|
00001 /* 00002 * Licensed to the Apache Software Foundation (ASF) under one or more 00003 * contributor license agreements. See the NOTICE file distributed with 00004 * this work for additional information regarding copyright ownership. 00005 * The ASF licenses this file to You under the Apache License, Version 2.0 00006 * (the "License"); you may not use this file except in compliance with 00007 * the License. You may obtain a copy of the License at 00008 * 00009 * http://www.apache.org/licenses/LICENSE-2.0 00010 * 00011 * Unless required by applicable law or agreed to in writing, software 00012 * distributed under the License is distributed on an "AS IS" BASIS, 00013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00014 * See the License for the specific language governing permissions and 00015 * limitations under the License. 00016 */ 00017 00018 /* 00019 * $Id: ICUTransService.cpp 901107 2010-01-20 08:45:02Z borisk $ 00020 */ 00021 00022 00023 // --------------------------------------------------------------------------- 00024 // Includes 00025 // --------------------------------------------------------------------------- 00026 #include <xercesc/util/PlatformUtils.hpp> 00027 #include <xercesc/util/Janitor.hpp> 00028 #include <xercesc/util/TranscodingException.hpp> 00029 #include <xercesc/util/XMLString.hpp> 00030 #include <xercesc/util/XMLUniDefs.hpp> 00031 #include "ICUTransService.hpp" 00032 #include <string.h> 00033 #include <unicode/uloc.h> 00034 #include <unicode/uchar.h> 00035 #include <unicode/ucnv.h> 00036 #include <unicode/ucnv_err.h> 00037 #include <unicode/ustring.h> 00038 #include <unicode/udata.h> 00039 #if (U_ICU_VERSION_MAJOR_NUM >= 2) 00040 #include <unicode/uclean.h> 00041 #endif 00042 00043 #if !defined(XML_OS390) && !defined(XML_AS400) && !defined(XML_HPUX) && !defined(XML_PTX) 00044 // Forward reference the symbol which points to the ICU converter data. 00045 #if (U_ICU_VERSION_MAJOR_NUM < 2) 00046 extern "C" const uint8_t U_IMPORT icudata_dat[]; 00047 #endif 00048 #endif 00049 00050 #if !defined(U16_NEXT_UNSAFE) && defined(UTF16_NEXT_CHAR_UNSAFE) 00051 #define U16_NEXT_UNSAFE UTF16_NEXT_CHAR_UNSAFE 00052 #endif 00053 00054 #if !defined(U16_APPEND_UNSAFE) && defined(UTF16_APPEND_CHAR_UNSAFE) 00055 #define U16_APPEND_UNSAFE UTF16_APPEND_CHAR_UNSAFE 00056 #endif 00057 00058 #if !defined(U_IS_BMP) && defined(UTF16_CHAR_LENGTH) 00059 #define U_IS_BMP(c) (UTF16_CHAR_LENGTH(c)==1) 00060 #endif 00061 00062 00063 XERCES_CPP_NAMESPACE_BEGIN 00064 00065 // --------------------------------------------------------------------------- 00066 // Local, const data 00067 // --------------------------------------------------------------------------- 00068 static const XMLCh gMyServiceId[] = 00069 { 00070 chLatin_I, chLatin_C, chLatin_U, chNull 00071 }; 00072 00073 static const XMLCh gS390Id[] = 00074 { 00075 chLatin_S, chDigit_3, chDigit_9, chDigit_0, chNull 00076 }; 00077 00078 static const XMLCh gs390Id[] = 00079 { 00080 chLatin_s, chDigit_3, chDigit_9, chDigit_0, chNull 00081 }; 00082 00083 static const XMLCh gswaplfnlId[] = 00084 { 00085 chComma, chLatin_s, chLatin_w, chLatin_a, chLatin_p, 00086 chLatin_l, chLatin_f, chLatin_n, chLatin_l, chNull 00087 }; 00088 // --------------------------------------------------------------------------- 00089 // Local functions 00090 // --------------------------------------------------------------------------- 00091 00092 // 00093 // When XMLCh and ICU's UChar are not the same size, we have to do a temp 00094 // conversion of all strings. These local helper methods make that easier. 00095 // 00096 static UChar* convertToUChar( const XMLCh* const toConvert 00097 , const XMLSize_t srcLen = 0 00098 , MemoryManager* const manager = 0) 00099 { 00100 const XMLSize_t actualLen = srcLen 00101 ? srcLen : XMLString::stringLen(toConvert); 00102 00103 UChar* tmpBuf = (manager) 00104 ? (UChar*) manager->allocate((actualLen + 1) * sizeof(UChar)) 00105 : new UChar[actualLen + 1]; 00106 const XMLCh* srcPtr = toConvert; 00107 UChar* outPtr = tmpBuf; 00108 while (*srcPtr) 00109 *outPtr++ = UChar(*srcPtr++); 00110 *outPtr = 0; 00111 00112 return tmpBuf; 00113 } 00114 00115 00116 static XMLCh* convertToXMLCh( const UChar* const toConvert, 00117 MemoryManager* const manager = 0) 00118 { 00119 const unsigned int srcLen = u_strlen(toConvert); 00120 XMLCh* retBuf = (manager) 00121 ? (XMLCh*) manager->allocate((srcLen+1) * sizeof(XMLCh)) 00122 : new XMLCh[srcLen + 1]; 00123 00124 XMLCh* outPtr = retBuf; 00125 const UChar* srcPtr = toConvert; 00126 while (*srcPtr) 00127 *outPtr++ = XMLCh(*srcPtr++); 00128 *outPtr = 0; 00129 00130 return retBuf; 00131 } 00132 00133 00134 00135 00136 // --------------------------------------------------------------------------- 00137 // ICUTransService: Constructors and Destructor 00138 // --------------------------------------------------------------------------- 00139 ICUTransService::ICUTransService(MemoryManager*) 00140 { 00141 // Starting with ICU 3.4 we don't need to call init anymore. 00142 // 00143 #if (U_ICU_VERSION_MAJOR_NUM > 2 || (U_ICU_VERSION_MAJOR_NUM == 2 && U_ICU_VERSION_MINOR_NUM >= 6)) && \ 00144 (U_ICU_VERSION_MAJOR_NUM < 3 || (U_ICU_VERSION_MAJOR_NUM == 3 && U_ICU_VERSION_MINOR_NUM < 4)) 00145 UErrorCode errorCode=U_ZERO_ERROR; 00146 u_init(&errorCode); 00147 if(U_FAILURE(errorCode)) { 00148 XMLPlatformUtils::panic(PanicHandler::Panic_NoTransService); 00149 } 00150 #endif 00151 00152 #if !defined(XML_OS390) && !defined(XML_AS400) && !defined(XML_HPUX) && !defined(XML_PTX) 00153 #if (U_ICU_VERSION_MAJOR_NUM < 2) 00154 // Starting with ICU 2.0, ICU itself includes a static reference to the data 00155 // entrypoint symbol. 00156 // 00157 // ICU 1.8 (and previous) did not include a static reference, but would 00158 // dynamically load the data dll when it was first needed, however this dynamic 00159 // loading proved unreliable in some of the odd environments that Xerces needed 00160 // to run in. Hence, the static reference. 00161 00162 // Pass the location of the converter data to ICU. By doing so, we are 00163 // forcing the load of ICU converter data DLL, after the Xerces-C DLL is 00164 // loaded. This implies that Xerces-C, now has to explicitly link with the 00165 // ICU converter dll. However, the advantage is that we no longer depend 00166 // on the code which does demand dynamic loading of DLL's. The demand 00167 // loading is highly system dependent and was a constant source of support 00168 // calls. 00169 UErrorCode uerr = U_ZERO_ERROR; 00170 udata_setCommonData((void *) icudata_dat, &uerr); 00171 #endif 00172 #endif 00173 } 00174 00175 ICUTransService::~ICUTransService() 00176 { 00177 /* 00178 * commented out the following clean up code 00179 * in case users use ICU outside of the parser 00180 * if we clean up here, users' code may crash 00181 * 00182 #if (U_ICU_VERSION_MAJOR_NUM >= 2) 00183 // release all lazily allocated data 00184 u_cleanup(); 00185 #endif 00186 */ 00187 } 00188 00189 00190 // --------------------------------------------------------------------------- 00191 // ICUTransService: The virtual transcoding service API 00192 // --------------------------------------------------------------------------- 00193 int ICUTransService::compareIString(const XMLCh* const comp1 00194 , const XMLCh* const comp2) 00195 { 00196 size_t i = 0; 00197 size_t j = 0; 00198 00199 for(;;) 00200 { 00201 UChar32 ch1; 00202 UChar32 ch2; 00203 00204 U16_NEXT_UNSAFE(comp1, i, ch1); 00205 U16_NEXT_UNSAFE(comp2, j, ch2); 00206 00207 const UChar32 folded1 = 00208 u_foldCase(ch1, U_FOLD_CASE_DEFAULT); 00209 00210 const UChar32 folded2 = 00211 u_foldCase(ch2, U_FOLD_CASE_DEFAULT); 00212 00213 if (folded1 != 00214 folded2) 00215 { 00216 return folded1 - folded2; 00217 } 00218 else if (ch1 == 0) 00219 { 00220 // If ch1 is 0, the ch2 must also be 00221 // 0. Otherwise, the previous if 00222 // would have failed. 00223 break; 00224 } 00225 } 00226 00227 return 0; 00228 } 00229 00230 00231 int ICUTransService::compareNIString(const XMLCh* const comp1 00232 , const XMLCh* const comp2 00233 , const XMLSize_t maxChars) 00234 { 00235 if (maxChars > 0) 00236 { 00237 // Note that this function has somewhat broken semantics, as it's 00238 // possible for two strings of different lengths to compare as equal 00239 // in a case-insensitive manner, since one character could be 00240 // represented as a surrogate pair. 00241 size_t i = 0; 00242 size_t j = 0; 00243 00244 for(;;) 00245 { 00246 UChar32 ch1; 00247 UChar32 ch2; 00248 00249 U16_NEXT_UNSAFE(comp1, i, ch1); 00250 U16_NEXT_UNSAFE(comp2, j, ch2); 00251 00252 const UChar32 folded1 = 00253 u_foldCase(ch1, U_FOLD_CASE_DEFAULT); 00254 00255 const UChar32 folded2 = 00256 u_foldCase(ch2, U_FOLD_CASE_DEFAULT); 00257 00258 if (folded1 != folded2) 00259 { 00260 return folded1 - folded2; 00261 } 00262 else if (i == maxChars) 00263 { 00264 // If we're at the end of both strings, return 0. 00265 // Otherwise, we've run out of characters in the 00266 // left string, so return -1. 00267 return j == maxChars ? 0 : -1; 00268 } 00269 else if (j == maxChars) 00270 { 00271 // We've run out of characters in the right string, 00272 // but not the left, so return 1. 00273 return 1; 00274 } 00275 } 00276 } 00277 00278 return 0; 00279 } 00280 00281 00282 const XMLCh* ICUTransService::getId() const 00283 { 00284 return gMyServiceId; 00285 } 00286 00287 XMLLCPTranscoder* ICUTransService::makeNewLCPTranscoder(MemoryManager* manager) 00288 { 00289 // 00290 // Try to create a default converter. If it fails, return a null 00291 // pointer which will basically cause the system to give up because 00292 // we really can't do anything without one. 00293 // 00294 UErrorCode uerr = U_ZERO_ERROR; 00295 UConverter* converter = ucnv_open(NULL, &uerr); 00296 if (!converter) 00297 return 0; 00298 00299 // That went ok, so create an ICU LCP transcoder wrapper and return it 00300 return new (manager) ICULCPTranscoder(converter); 00301 } 00302 00303 00304 bool ICUTransService::supportsSrcOfs() const 00305 { 00306 // This implementation supports source offset information 00307 return true; 00308 } 00309 00310 00311 template <class FunctionType> 00312 static void doCaseConvert(XMLCh* convertString, 00313 FunctionType caseFunction) 00314 { 00315 // Note the semantics of this function are broken, since it's 00316 // possible that changing the case of a string could increase 00317 // its length, but there's no way to handle such a situation. 00318 const XMLSize_t len = XMLString::stringLen(convertString); 00319 00320 size_t readPos = 0; 00321 size_t writePos = 0; 00322 00323 while(readPos < len) 00324 { 00325 UChar32 original; 00326 00327 // Get the next Unicode code point. 00328 U16_NEXT_UNSAFE(convertString, readPos, original); 00329 00330 // Convert the code point 00331 const UChar32 converted = caseFunction(original); 00332 00333 // OK, now here's where it gets ugly. 00334 if (!U_IS_BMP(converted) && U_IS_BMP(original) && 00335 readPos - writePos == 1) 00336 { 00337 // We do not have room to convert the 00338 // character without overwriting the next 00339 // character, so we will just stop. 00340 break; 00341 } 00342 else 00343 { 00344 U16_APPEND_UNSAFE(convertString, writePos, converted); 00345 } 00346 } 00347 00348 convertString[writePos] = 0; 00349 } 00350 00351 00352 00353 void ICUTransService::upperCase(XMLCh* const toUpperCase) 00354 { 00355 doCaseConvert(toUpperCase, u_toupper); 00356 } 00357 00358 void ICUTransService::lowerCase(XMLCh* const toLowerCase) 00359 { 00360 doCaseConvert(toLowerCase, u_tolower); 00361 } 00362 00363 00364 00365 // --------------------------------------------------------------------------- 00366 // ICUTransService: The protected virtual transcoding service API 00367 // --------------------------------------------------------------------------- 00368 XMLTranscoder* ICUTransService:: 00369 makeNewXMLTranscoder(const XMLCh* const encodingName 00370 , XMLTransService::Codes& resValue 00371 , const XMLSize_t blockSize 00372 , MemoryManager* const manager) 00373 { 00374 // 00375 // For encodings that end with "s390" we need to strip off the "s390" 00376 // from the encoding name and add ",swaplfnl" to the encoding name 00377 // that we pass into ICU on the ucnv_openU. 00378 // 00379 XMLCh* encodingNameToUse = (XMLCh*) encodingName; 00380 XMLCh* workBuffer = 0; 00381 00382 if ( (XMLString::endsWith(encodingNameToUse, gs390Id)) || 00383 (XMLString::endsWith(encodingNameToUse, gS390Id)) ) 00384 { 00385 XMLSize_t workBufferSize = (XMLString::stringLen(encodingNameToUse) + XMLString::stringLen(gswaplfnlId) - XMLString::stringLen(gS390Id) + 1); 00386 workBuffer = (XMLCh*) manager->allocate(workBufferSize * sizeof(XMLCh)); 00387 XMLSize_t moveSize = XMLString::stringLen(encodingNameToUse) - XMLString::stringLen(gS390Id); 00388 XMLString::moveChars(workBuffer, encodingNameToUse, moveSize); 00389 XMLString::moveChars((workBuffer + moveSize), gswaplfnlId, XMLString::stringLen(gswaplfnlId)); 00390 encodingNameToUse = workBuffer; 00391 } 00392 00393 // 00394 // If UChar and XMLCh are not the same size, then we have premassage the 00395 // encoding name into a UChar type string. 00396 // 00397 const UChar* actualName; 00398 UChar* tmpName = 0; 00399 if (sizeof(UChar) == sizeof(XMLCh)) 00400 { 00401 actualName = (const UChar*)encodingNameToUse; 00402 } 00403 else 00404 { 00405 tmpName = convertToUChar(encodingNameToUse, 0, manager); 00406 actualName = tmpName; 00407 } 00408 00409 ArrayJanitor<UChar> janTmp(tmpName, manager); 00410 ArrayJanitor<XMLCh> janTmp1(workBuffer, manager); 00411 00412 UErrorCode uerr = U_ZERO_ERROR; 00413 UConverter* converter = ucnv_openU(actualName, &uerr); 00414 if (!converter) 00415 { 00416 resValue = XMLTransService::UnsupportedEncoding; 00417 return 0; 00418 } 00419 00420 return new (manager) ICUTranscoder(encodingName, converter, blockSize, manager); 00421 } 00422 00423 00424 00425 00426 // --------------------------------------------------------------------------- 00427 // ICUTranscoder: Constructors and Destructor 00428 // --------------------------------------------------------------------------- 00429 ICUTranscoder::ICUTranscoder(const XMLCh* const encodingName 00430 , UConverter* const toAdopt 00431 , const XMLSize_t blockSize 00432 , MemoryManager* const manager) : 00433 00434 XMLTranscoder(encodingName, blockSize, manager) 00435 , fConverter(toAdopt) 00436 , fFixed(false) 00437 , fSrcOffsets(0) 00438 { 00439 // If there is a block size, then allocate our source offset array 00440 if (blockSize) 00441 fSrcOffsets = (XMLUInt32*) manager->allocate 00442 ( 00443 blockSize * sizeof(XMLUInt32) 00444 );//new XMLUInt32[blockSize]; 00445 00446 // Remember if its a fixed size encoding 00447 fFixed = (ucnv_getMaxCharSize(fConverter) == ucnv_getMinCharSize(fConverter)); 00448 } 00449 00450 ICUTranscoder::~ICUTranscoder() 00451 { 00452 getMemoryManager()->deallocate(fSrcOffsets);//delete [] fSrcOffsets; 00453 00454 // If there is a converter, ask ICU to clean it up 00455 if (fConverter) 00456 { 00457 // <TBD> Does this actually delete the structure??? 00458 ucnv_close(fConverter); 00459 fConverter = 0; 00460 } 00461 } 00462 00463 00464 // --------------------------------------------------------------------------- 00465 // ICUTranscoder: The virtual transcoder API 00466 // --------------------------------------------------------------------------- 00467 XMLSize_t 00468 ICUTranscoder::transcodeFrom(const XMLByte* const srcData 00469 , const XMLSize_t srcCount 00470 , XMLCh* const toFill 00471 , const XMLSize_t maxChars 00472 , XMLSize_t& bytesEaten 00473 , unsigned char* const charSizes) 00474 { 00475 // Set up pointers to the start and end of the source buffer 00476 const XMLByte* startSrc = srcData; 00477 const XMLByte* endSrc = srcData + srcCount; 00478 00479 // 00480 // And now do the target buffer. This works differently according to 00481 // whether XMLCh and UChar are the same size or not. 00482 // 00483 UChar* startTarget; 00484 if (sizeof(XMLCh) == sizeof(UChar)) 00485 startTarget = (UChar*)toFill; 00486 else 00487 startTarget = (UChar*) getMemoryManager()->allocate 00488 ( 00489 maxChars * sizeof(UChar) 00490 );//new UChar[maxChars]; 00491 UChar* orgTarget = startTarget; 00492 00493 // 00494 // Transcode the buffer. Buffer overflow errors are normal, occuring 00495 // when the raw input buffer holds more characters than will fit in 00496 // the Unicode output buffer. 00497 // 00498 UErrorCode err = U_ZERO_ERROR; 00499 ucnv_toUnicode 00500 ( 00501 fConverter 00502 , &startTarget 00503 , startTarget + maxChars 00504 , (const char**)&startSrc 00505 , (const char*)endSrc 00506 , (fFixed ? 0 : (int32_t*)fSrcOffsets) 00507 , false 00508 , &err 00509 ); 00510 00511 if ((err != U_ZERO_ERROR) && (err != U_BUFFER_OVERFLOW_ERROR)) 00512 { 00513 if (orgTarget != (UChar*)toFill) 00514 getMemoryManager()->deallocate(orgTarget);//delete [] orgTarget; 00515 00516 if (fFixed) 00517 { 00518 XMLCh tmpBuf[17]; 00519 XMLString::binToText((unsigned int)(*startTarget), tmpBuf, 16, 16, getMemoryManager()); 00520 ThrowXMLwithMemMgr2 00521 ( 00522 TranscodingException 00523 , XMLExcepts::Trans_BadSrcCP 00524 , tmpBuf 00525 , getEncodingName() 00526 , getMemoryManager() 00527 ); 00528 } 00529 else 00530 { 00531 ThrowXMLwithMemMgr(TranscodingException, XMLExcepts::Trans_BadSrcSeq, getMemoryManager()); 00532 } 00533 } 00534 00535 // Calculate the bytes eaten and store in caller's param 00536 bytesEaten = startSrc - srcData; 00537 00538 // And the characters decoded 00539 const XMLSize_t charsDecoded = startTarget - orgTarget; 00540 00541 // 00542 // Translate the array of char offsets into an array of character 00543 // sizes, which is what the transcoder interface semantics requires. 00544 // If its fixed, then we can optimize it. 00545 // 00546 if (fFixed) 00547 { 00548 const unsigned char fillSize = (unsigned char)ucnv_getMaxCharSize(fConverter); 00549 memset(charSizes, fillSize, maxChars); 00550 } 00551 else 00552 { 00553 // 00554 // We have to convert the series of offsets into a series of 00555 // sizes. If just one char was decoded, then its the total bytes 00556 // eaten. Otherwise, do a loop and subtract out each element from 00557 // its previous element. 00558 // 00559 if (charsDecoded == 1) 00560 { 00561 charSizes[0] = (unsigned char)bytesEaten; 00562 } 00563 else 00564 { 00565 // ICU does not return an extra element to allow us to figure 00566 // out the last char size, so we have to compute it from the 00567 // total bytes used. 00568 unsigned int index; 00569 for (index = 0; index < charsDecoded - 1; index++) 00570 { 00571 charSizes[index] = (unsigned char)(fSrcOffsets[index + 1] 00572 - fSrcOffsets[index]); 00573 } 00574 if( charsDecoded > 0 ) { 00575 charSizes[charsDecoded - 1] = (unsigned char)(bytesEaten 00576 - fSrcOffsets[charsDecoded - 1]); 00577 } 00578 } 00579 } 00580 00581 // 00582 // If XMLCh and UChar are not the same size, then we need to copy over 00583 // the temp buffer to the new one. 00584 // 00585 if (sizeof(UChar) != sizeof(XMLCh)) 00586 { 00587 XMLCh* outPtr = toFill; 00588 startTarget = orgTarget; 00589 for (unsigned int index = 0; index < charsDecoded; index++) 00590 *outPtr++ = XMLCh(*startTarget++); 00591 00592 // And delete the temp buffer 00593 getMemoryManager()->deallocate(orgTarget);//delete [] orgTarget; 00594 } 00595 00596 // Return the chars we put into the target buffer 00597 return charsDecoded; 00598 } 00599 00600 00601 XMLSize_t 00602 ICUTranscoder::transcodeTo( const XMLCh* const srcData 00603 , const XMLSize_t srcCount 00604 , XMLByte* const toFill 00605 , const XMLSize_t maxBytes 00606 , XMLSize_t& charsEaten 00607 , const UnRepOpts options) 00608 { 00609 // 00610 // Get a pointer to the buffer to transcode. If UChar and XMLCh are 00611 // the same size here, then use the original. Else, create a temp 00612 // one and put a janitor on it. 00613 // 00614 const UChar* srcPtr; 00615 UChar* tmpBufPtr = 0; 00616 if (sizeof(XMLCh) == sizeof(UChar)) 00617 { 00618 srcPtr = (const UChar*)srcData; 00619 } 00620 else 00621 { 00622 tmpBufPtr = convertToUChar(srcData, srcCount, getMemoryManager()); 00623 srcPtr = tmpBufPtr; 00624 } 00625 ArrayJanitor<UChar> janTmpBuf(tmpBufPtr, getMemoryManager()); 00626 00627 // 00628 // Set the appropriate callback so that it will either fail or use 00629 // the rep char. Remember the old one so we can put it back. 00630 // 00631 UErrorCode err = U_ZERO_ERROR; 00632 UConverterFromUCallback oldCB = NULL; 00633 #if (U_ICU_VERSION_MAJOR_NUM < 2) 00634 void* orgContent; 00635 #else 00636 const void* orgContent; 00637 #endif 00638 ucnv_setFromUCallBack 00639 ( 00640 fConverter 00641 , (options == UnRep_Throw) ? UCNV_FROM_U_CALLBACK_STOP 00642 : UCNV_FROM_U_CALLBACK_SUBSTITUTE 00643 , NULL 00644 , &oldCB 00645 , &orgContent 00646 , &err 00647 ); 00648 00649 // 00650 // Ok, lets transcode as many chars as we we can in one shot. The 00651 // ICU API gives enough info not to have to do this one char by char. 00652 // 00653 XMLByte* startTarget = toFill; 00654 const UChar* startSrc = srcPtr; 00655 err = U_ZERO_ERROR; 00656 ucnv_fromUnicode 00657 ( 00658 fConverter 00659 , (char**)&startTarget 00660 , (char*)(startTarget + maxBytes) 00661 , &startSrc 00662 , srcPtr + srcCount 00663 , 0 00664 , false 00665 , &err 00666 ); 00667 00668 // Rememember the status before we possibly overite the error code 00669 const bool res = (err == U_ZERO_ERROR); 00670 00671 // Put the old handler back 00672 err = U_ZERO_ERROR; 00673 UConverterFromUCallback orgAction = NULL; 00674 00675 ucnv_setFromUCallBack(fConverter, oldCB, NULL, &orgAction, &orgContent, &err); 00676 00677 if (!res) 00678 { 00679 XMLCh tmpBuf[17]; 00680 XMLString::binToText((unsigned int)*startSrc, tmpBuf, 16, 16, getMemoryManager()); 00681 ThrowXMLwithMemMgr2 00682 ( 00683 TranscodingException 00684 , XMLExcepts::Trans_Unrepresentable 00685 , tmpBuf 00686 , getEncodingName() 00687 , getMemoryManager() 00688 ); 00689 } 00690 00691 // Fill in the chars we ate from the input 00692 charsEaten = startSrc - srcPtr; 00693 00694 // Return the chars we stored 00695 return startTarget - toFill; 00696 } 00697 00698 00699 bool ICUTranscoder::canTranscodeTo(const unsigned int toCheck) 00700 { 00701 // 00702 // If the passed value is really a surrogate embedded together, then 00703 // we need to break it out into its two chars. Else just one. While 00704 // we are ate it, convert them to UChar format if required. 00705 // 00706 UChar srcBuf[2]; 00707 unsigned int srcCount = 1; 00708 if (toCheck & 0xFFFF0000) 00709 { 00710 srcBuf[0] = UChar((toCheck >> 10) + 0xD800); 00711 srcBuf[1] = UChar(toCheck & 0x3FF) + 0xDC00; 00712 srcCount++; 00713 } 00714 else 00715 { 00716 srcBuf[0] = UChar(toCheck); 00717 } 00718 00719 // 00720 // Set the callback so that it will fail instead of using the rep char. 00721 // Remember the old one so we can put it back. 00722 // 00723 UErrorCode err = U_ZERO_ERROR; 00724 UConverterFromUCallback oldCB = NULL; 00725 #if (U_ICU_VERSION_MAJOR_NUM < 2) 00726 void* orgContent; 00727 #else 00728 const void* orgContent; 00729 #endif 00730 00731 ucnv_setFromUCallBack 00732 ( 00733 fConverter 00734 , UCNV_FROM_U_CALLBACK_STOP 00735 , NULL 00736 , &oldCB 00737 , &orgContent 00738 , &err 00739 ); 00740 00741 // Set upa temp buffer to format into. Make it more than big enough 00742 char tmpBuf[64]; 00743 char* startTarget = tmpBuf; 00744 const UChar* startSrc = srcBuf; 00745 00746 err = U_ZERO_ERROR; 00747 ucnv_fromUnicode 00748 ( 00749 fConverter 00750 , &startTarget 00751 , startTarget + 64 00752 , &startSrc 00753 , srcBuf + srcCount 00754 , 0 00755 , false 00756 , &err 00757 ); 00758 00759 // Save the result before we overight the error code 00760 const bool res = (err == U_ZERO_ERROR); 00761 00762 // Put the old handler back 00763 err = U_ZERO_ERROR; 00764 UConverterFromUCallback orgAction = NULL; 00765 00766 ucnv_setFromUCallBack(fConverter, oldCB, NULL, &orgAction, &orgContent, &err); 00767 00768 return res; 00769 } 00770 00771 00772 00773 // --------------------------------------------------------------------------- 00774 // ICULCPTranscoder: Constructors and Destructor 00775 // --------------------------------------------------------------------------- 00776 ICULCPTranscoder::ICULCPTranscoder(UConverter* const toAdopt) : 00777 00778 fConverter(toAdopt) 00779 { 00780 } 00781 00782 ICULCPTranscoder::~ICULCPTranscoder() 00783 { 00784 // If there is a converter, ask ICU to clean it up 00785 if (fConverter) 00786 { 00787 // <TBD> Does this actually delete the structure??? 00788 ucnv_close(fConverter); 00789 fConverter = 0; 00790 } 00791 } 00792 00793 00794 // --------------------------------------------------------------------------- 00795 // ICULCPTranscoder: Constructors and Destructor 00796 // --------------------------------------------------------------------------- 00797 XMLSize_t ICULCPTranscoder::calcRequiredSize(const XMLCh* const srcText 00798 , MemoryManager* const manager) 00799 { 00800 if (!srcText) 00801 return 0; 00802 00803 // 00804 // We do two different versions of this, according to whether XMLCh 00805 // is the same size as UChar or not. 00806 // 00807 UErrorCode err = U_ZERO_ERROR; 00808 int32_t targetCap; 00809 if (sizeof(XMLCh) == sizeof(UChar)) 00810 { 00811 // Use a faux scope to synchronize while we do this 00812 { 00813 XMLMutexLock lockConverter(&fMutex); 00814 00815 targetCap = ucnv_fromUChars 00816 ( 00817 fConverter 00818 , 0 00819 , 0 00820 , (const UChar*)srcText 00821 , -1 00822 , &err 00823 ); 00824 } 00825 } 00826 else 00827 { 00828 // Copy the source to a local temp 00829 UChar* tmpBuf = convertToUChar(srcText, 0, manager); 00830 ArrayJanitor<UChar> janTmp(tmpBuf, manager); 00831 00832 // Use a faux scope to synchronize while we do this 00833 { 00834 XMLMutexLock lockConverter(&fMutex); 00835 00836 targetCap = ucnv_fromUChars 00837 ( 00838 fConverter 00839 , 0 00840 , 0 00841 , tmpBuf 00842 , -1 00843 , &err 00844 ); 00845 } 00846 } 00847 00848 if (err != U_BUFFER_OVERFLOW_ERROR) 00849 return 0; 00850 00851 return (XMLSize_t)targetCap; 00852 } 00853 00854 XMLSize_t ICULCPTranscoder::calcRequiredSize(const char* const srcText 00855 , MemoryManager* const /*manager*/) 00856 { 00857 if (!srcText) 00858 return 0; 00859 00860 int32_t targetCap; 00861 UErrorCode err = U_ZERO_ERROR; 00862 00863 // Use a faux scope to synchronize while we do this 00864 { 00865 XMLMutexLock lockConverter(&fMutex); 00866 targetCap = ucnv_toUChars 00867 ( 00868 fConverter 00869 , 0 00870 , 0 00871 , srcText 00872 , (int32_t)strlen(srcText) 00873 , &err 00874 ); 00875 } 00876 00877 if (err != U_BUFFER_OVERFLOW_ERROR) 00878 return 0; 00879 00880 #if (U_ICU_VERSION_MAJOR_NUM < 2) 00881 // Subtract one since it includes the terminator space 00882 return (XMLSize_t)(targetCap - 1); 00883 #else 00884 // Starting ICU 2.0, this is fixed and all ICU String functions have consistent NUL-termination behavior. 00885 // The returned length is always the number of output UChar's, not counting an additional, terminating NUL. 00886 return (XMLSize_t)(targetCap); 00887 #endif 00888 } 00889 00890 00891 char* ICULCPTranscoder::transcode(const XMLCh* const toTranscode, 00892 MemoryManager* const manager) 00893 { 00894 char* retBuf = 0; 00895 00896 // Check for a couple of special cases 00897 if (!toTranscode) 00898 return retBuf; 00899 00900 if (!*toTranscode) 00901 { 00902 retBuf = (char*) manager->allocate(sizeof(char));//new char[1]; 00903 retBuf[0] = 0; 00904 return retBuf; 00905 } 00906 00907 // 00908 // Get the length of the source string since we'll have to use it in 00909 // a couple places below. 00910 // 00911 const XMLSize_t srcLen = XMLString::stringLen(toTranscode); 00912 00913 // 00914 // If XMLCh and UChar are not the same size, then we have to make a 00915 // temp copy of the text to pass to ICU. 00916 // 00917 const UChar* actualSrc; 00918 UChar* ncActual = 0; 00919 if (sizeof(XMLCh) == sizeof(UChar)) 00920 { 00921 actualSrc = (const UChar*)toTranscode; 00922 } 00923 else 00924 { 00925 // Allocate a non-const temp buf, but store it also in the actual 00926 ncActual = convertToUChar(toTranscode, 0, manager); 00927 actualSrc = ncActual; 00928 } 00929 00930 // Insure that the temp buffer, if any, gets cleaned up via the nc pointer 00931 ArrayJanitor<UChar> janTmp(ncActual, manager); 00932 00933 // Caculate a return buffer size not too big, but less likely to overflow 00934 int32_t targetLen = (int32_t)(srcLen * 1.25); 00935 00936 // Allocate the return buffer 00937 retBuf = (char*) manager->allocate((targetLen + 1) * sizeof(char));//new char[targetLen + 1]; 00938 00939 // 00940 // Lock now while we call the converter. Use a faux block to do the 00941 // lock so that it unlocks immediately afterwards. 00942 // 00943 UErrorCode err = U_ZERO_ERROR; 00944 int32_t targetCap; 00945 { 00946 XMLMutexLock lockConverter(&fMutex); 00947 00948 targetCap = ucnv_fromUChars 00949 ( 00950 fConverter 00951 , retBuf 00952 , targetLen + 1 00953 , actualSrc 00954 , -1 00955 , &err 00956 ); 00957 } 00958 00959 // If targetLen is not enough then buffer overflow might occur 00960 if ((err == U_BUFFER_OVERFLOW_ERROR) || (err == U_STRING_NOT_TERMINATED_WARNING)) 00961 { 00962 // 00963 // Reset the error, delete the old buffer, allocate a new one, 00964 // and try again. 00965 // 00966 err = U_ZERO_ERROR; 00967 manager->deallocate(retBuf);//delete [] retBuf; 00968 retBuf = (char*) manager->allocate((targetCap + 1) * sizeof(char));//new char[targetCap + 1]; 00969 00970 // Lock again before we retry 00971 XMLMutexLock lockConverter(&fMutex); 00972 targetCap = ucnv_fromUChars 00973 ( 00974 fConverter 00975 , retBuf 00976 , targetCap + 1 00977 , actualSrc 00978 , -1 00979 , &err 00980 ); 00981 } 00982 00983 if (U_FAILURE(err)) 00984 { 00985 manager->deallocate(retBuf);//delete [] retBuf; 00986 return 0; 00987 } 00988 00989 return retBuf; 00990 } 00991 00992 XMLCh* ICULCPTranscoder::transcode(const char* const toTranscode, 00993 MemoryManager* const manager) 00994 { 00995 // Watch for a few pyscho corner cases 00996 if (!toTranscode) 00997 return 0; 00998 00999 if (!*toTranscode) 01000 { 01001 XMLCh* retVal = (XMLCh*) manager->allocate(sizeof(XMLCh));//new XMLCh[1]; 01002 retVal[0] = 0; 01003 return retVal; 01004 } 01005 01006 // 01007 // Get the length of the string to transcode. The Unicode string will 01008 // almost always be no more chars than were in the source, so this is 01009 // the best guess as to the storage needed. 01010 // 01011 const int32_t srcLen = (int32_t)strlen(toTranscode); 01012 01013 // We need a target buffer of UChars to fill in 01014 UChar* targetBuf = 0; 01015 01016 // Now lock while we do these calculations 01017 UErrorCode err = U_ZERO_ERROR; 01018 int32_t targetCap; 01019 { 01020 XMLMutexLock lockConverter(&fMutex); 01021 01022 // 01023 // Here we don't know what the target length will be so use 0 and 01024 // expect an U_BUFFER_OVERFLOW_ERROR in which case it'd get resolved 01025 // by the correct capacity value. 01026 // 01027 targetCap = ucnv_toUChars 01028 ( 01029 fConverter 01030 , 0 01031 , 0 01032 , toTranscode 01033 , srcLen 01034 , &err 01035 ); 01036 01037 if (err != U_BUFFER_OVERFLOW_ERROR) 01038 return 0; 01039 01040 err = U_ZERO_ERROR; 01041 targetBuf = (UChar*) manager->allocate((targetCap+1) * sizeof(UChar));//new UChar[targetCap + 1]; 01042 ucnv_toUChars 01043 ( 01044 fConverter 01045 , targetBuf 01046 , targetCap + 1 01047 , toTranscode 01048 , srcLen 01049 , &err 01050 ); 01051 } 01052 01053 if (U_FAILURE(err)) 01054 { 01055 // Clean up if we got anything allocated 01056 manager->deallocate(targetBuf);//delete [] targetBuf; 01057 return 0; 01058 } 01059 01060 // Cap it off to make sure 01061 targetBuf[targetCap] = 0; 01062 01063 // 01064 // If XMLCh and UChar are the same size, then we can return retVal 01065 // as is. Else, we have to allocate another buffer and copy the data 01066 // over to it. 01067 // 01068 XMLCh* actualRet; 01069 if (sizeof(XMLCh) == sizeof(UChar)) 01070 { 01071 actualRet = (XMLCh*)targetBuf; 01072 } 01073 else 01074 { 01075 actualRet = convertToXMLCh(targetBuf, manager); 01076 manager->deallocate(targetBuf);//delete [] targetBuf; 01077 } 01078 return actualRet; 01079 } 01080 01081 01082 bool ICULCPTranscoder::transcode(const char* const toTranscode 01083 , XMLCh* const toFill 01084 , const XMLSize_t maxChars 01085 , MemoryManager* const manager) 01086 { 01087 // Check for a couple of psycho corner cases 01088 if (!toTranscode || !maxChars) 01089 { 01090 toFill[0] = 0; 01091 return true; 01092 } 01093 01094 if (!*toTranscode) 01095 { 01096 toFill[0] = 0; 01097 return true; 01098 } 01099 01100 // We'll need this in a couple of places below 01101 const XMLSize_t srcLen = strlen(toTranscode); 01102 01103 // 01104 // Set up the target buffer. If XMLCh and UChar are not the same size 01105 // then we have to use a temp buffer and convert over. 01106 // 01107 UChar* targetBuf; 01108 if (sizeof(XMLCh) == sizeof(UChar)) 01109 targetBuf = (UChar*)toFill; 01110 else 01111 targetBuf = (UChar*) manager->allocate 01112 ( 01113 (maxChars + 1) * sizeof(UChar) 01114 );//new UChar[maxChars + 1]; 01115 01116 // 01117 // Use a faux block to enforce a lock on the converter, which will 01118 // unlock immediately after its completed. 01119 // 01120 UErrorCode err = U_ZERO_ERROR; 01121 { 01122 XMLMutexLock lockConverter(&fMutex); 01123 ucnv_toUChars 01124 ( 01125 fConverter 01126 , targetBuf 01127 , (int32_t)maxChars + 1 01128 , toTranscode 01129 , (int32_t)srcLen 01130 , &err 01131 ); 01132 } 01133 01134 if (U_FAILURE(err)) 01135 { 01136 if (targetBuf != (UChar*)toFill) 01137 manager->deallocate(targetBuf);//delete [] targetBuf; 01138 return false; 01139 } 01140 01141 // If the sizes are not the same, then copy the data over 01142 if (sizeof(XMLCh) != sizeof(UChar)) 01143 { 01144 UChar* srcPtr = targetBuf; 01145 XMLCh* outPtr = toFill; 01146 while (*srcPtr) 01147 *outPtr++ = XMLCh(*srcPtr++); 01148 *outPtr = 0; 01149 01150 // And delete the temp buffer 01151 manager->deallocate(targetBuf);//delete [] targetBuf; 01152 } 01153 01154 return true; 01155 } 01156 01157 01158 bool ICULCPTranscoder::transcode( const XMLCh* const toTranscode 01159 , char* const toFill 01160 , const XMLSize_t maxChars 01161 , MemoryManager* const manager) 01162 { 01163 // Watch for a few psycho corner cases 01164 if (!toTranscode || !maxChars) 01165 { 01166 toFill[0] = 0; 01167 return true; 01168 } 01169 01170 if (!*toTranscode) 01171 { 01172 toFill[0] = 0; 01173 return true; 01174 } 01175 01176 // 01177 // If XMLCh and UChar are not the same size, then we have to make a 01178 // temp copy of the text to pass to ICU. 01179 // 01180 const UChar* actualSrc; 01181 UChar* ncActual = 0; 01182 if (sizeof(XMLCh) == sizeof(UChar)) 01183 { 01184 actualSrc = (const UChar*)toTranscode; 01185 } 01186 else 01187 { 01188 // Allocate a non-const temp buf, but store it also in the actual 01189 ncActual = convertToUChar(toTranscode, 0, manager); 01190 actualSrc = ncActual; 01191 } 01192 01193 // Insure that the temp buffer, if any, gets cleaned up via the nc pointer 01194 ArrayJanitor<UChar> janTmp(ncActual, manager); 01195 01196 // 01197 // Use a faux block to enforce a lock on the converter while we do this. 01198 // It will be released immediately after its done. 01199 // 01200 UErrorCode err = U_ZERO_ERROR; 01201 int32_t targetCap; 01202 { 01203 XMLMutexLock lockConverter(&fMutex); 01204 targetCap = ucnv_fromUChars 01205 ( 01206 fConverter 01207 , toFill 01208 , (int32_t)maxChars 01209 , actualSrc 01210 , -1 01211 , &err 01212 ); 01213 } 01214 01215 if (U_FAILURE(err)) 01216 return false; 01217 01218 toFill[targetCap] = 0; 01219 return true; 01220 } 01221 01222 XERCES_CPP_NAMESPACE_END