GME  13
ICUTransService.cpp
Go to the documentation of this file.
00001 /*
00002  * Licensed to the Apache Software Foundation (ASF) under one or more
00003  * contributor license agreements.  See the NOTICE file distributed with
00004  * this work for additional information regarding copyright ownership.
00005  * The ASF licenses this file to You under the Apache License, Version 2.0
00006  * (the "License"); you may not use this file except in compliance with
00007  * the License.  You may obtain a copy of the License at
00008  *
00009  *      http://www.apache.org/licenses/LICENSE-2.0
00010  *
00011  * Unless required by applicable law or agreed to in writing, software
00012  * distributed under the License is distributed on an "AS IS" BASIS,
00013  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00014  * See the License for the specific language governing permissions and
00015  * limitations under the License.
00016  */
00017 
00018 /*
00019  * $Id: ICUTransService.cpp 901107 2010-01-20 08:45:02Z borisk $
00020  */
00021 
00022 
00023 // ---------------------------------------------------------------------------
00024 //  Includes
00025 // ---------------------------------------------------------------------------
00026 #include <xercesc/util/PlatformUtils.hpp>
00027 #include <xercesc/util/Janitor.hpp>
00028 #include <xercesc/util/TranscodingException.hpp>
00029 #include <xercesc/util/XMLString.hpp>
00030 #include <xercesc/util/XMLUniDefs.hpp>
00031 #include "ICUTransService.hpp"
00032 #include <string.h>
00033 #include <unicode/uloc.h>
00034 #include <unicode/uchar.h>
00035 #include <unicode/ucnv.h>
00036 #include <unicode/ucnv_err.h>
00037 #include <unicode/ustring.h>
00038 #include <unicode/udata.h>
00039 #if (U_ICU_VERSION_MAJOR_NUM >= 2)
00040     #include <unicode/uclean.h>
00041 #endif
00042 
00043 #if !defined(XML_OS390) && !defined(XML_AS400) && !defined(XML_HPUX) && !defined(XML_PTX)
00044 // Forward reference the symbol which points to the ICU converter data.
00045 #if (U_ICU_VERSION_MAJOR_NUM < 2)
00046 extern "C" const uint8_t U_IMPORT icudata_dat[];
00047 #endif
00048 #endif
00049 
// Fallback definitions: when one of the macros used in this file is missing
// but its UTF16_* counterpart is available, define it in terms of that
// counterpart.
#ifndef U16_NEXT_UNSAFE
#  ifdef UTF16_NEXT_CHAR_UNSAFE
#    define U16_NEXT_UNSAFE UTF16_NEXT_CHAR_UNSAFE
#  endif
#endif

#ifndef U16_APPEND_UNSAFE
#  ifdef UTF16_APPEND_CHAR_UNSAFE
#    define U16_APPEND_UNSAFE UTF16_APPEND_CHAR_UNSAFE
#  endif
#endif

// A code point is in the BMP exactly when it needs a single UTF-16 unit.
#ifndef U_IS_BMP
#  ifdef UTF16_CHAR_LENGTH
#    define U_IS_BMP(c) (UTF16_CHAR_LENGTH(c)==1)
#  endif
#endif
00061 
00062 
00063 XERCES_CPP_NAMESPACE_BEGIN
00064 
00065 // ---------------------------------------------------------------------------
00066 //  Local, const data
00067 // ---------------------------------------------------------------------------
00068 static const XMLCh gMyServiceId[] =
00069 {
00070     chLatin_I, chLatin_C, chLatin_U, chNull
00071 };
00072 
00073 static const XMLCh gS390Id[] =
00074 {
00075     chLatin_S, chDigit_3, chDigit_9, chDigit_0, chNull
00076 };
00077 
00078 static const XMLCh gs390Id[] =
00079 {
00080     chLatin_s, chDigit_3, chDigit_9, chDigit_0, chNull
00081 };
00082 
00083 static const XMLCh gswaplfnlId[] =
00084 {
00085     chComma, chLatin_s, chLatin_w, chLatin_a, chLatin_p,
00086     chLatin_l, chLatin_f, chLatin_n, chLatin_l, chNull
00087 };
00088 // ---------------------------------------------------------------------------
00089 //  Local functions
00090 // ---------------------------------------------------------------------------
00091 
00092 //
00093 //  When XMLCh and ICU's UChar are not the same size, we have to do a temp
00094 //  conversion of all strings. These local helper methods make that easier.
00095 //
00096 static UChar* convertToUChar( const XMLCh* const   toConvert
00097                             , const XMLSize_t      srcLen = 0
00098                             , MemoryManager* const manager = 0)
00099 {
00100     const XMLSize_t actualLen = srcLen
00101                                    ? srcLen : XMLString::stringLen(toConvert);
00102 
00103     UChar* tmpBuf = (manager)
00104         ? (UChar*) manager->allocate((actualLen + 1) * sizeof(UChar))
00105                 : new UChar[actualLen + 1];
00106     const XMLCh* srcPtr = toConvert;
00107     UChar* outPtr = tmpBuf;
00108     while (*srcPtr)
00109         *outPtr++ = UChar(*srcPtr++);
00110     *outPtr = 0;
00111 
00112     return tmpBuf;
00113 }
00114 
00115 
00116 static XMLCh* convertToXMLCh( const UChar* const toConvert,
00117                             MemoryManager* const manager = 0)
00118 {
00119     const unsigned int srcLen = u_strlen(toConvert);
00120     XMLCh* retBuf = (manager)
00121         ? (XMLCh*) manager->allocate((srcLen+1) * sizeof(XMLCh))
00122         : new XMLCh[srcLen + 1];
00123 
00124     XMLCh* outPtr = retBuf;
00125     const UChar* srcPtr = toConvert;
00126     while (*srcPtr)
00127         *outPtr++ = XMLCh(*srcPtr++);
00128     *outPtr = 0;
00129 
00130     return retBuf;
00131 }
00132 
00133 
00134 
00135 
00136 // ---------------------------------------------------------------------------
00137 //  ICUTransService: Constructors and Destructor
00138 // ---------------------------------------------------------------------------
00139 ICUTransService::ICUTransService(MemoryManager*)
00140 {
00141   // Starting with ICU 3.4 we don't need to call init anymore.
00142   //
00143 #if (U_ICU_VERSION_MAJOR_NUM > 2 || (U_ICU_VERSION_MAJOR_NUM == 2 && U_ICU_VERSION_MINOR_NUM >= 6)) && \
00144   (U_ICU_VERSION_MAJOR_NUM < 3 || (U_ICU_VERSION_MAJOR_NUM == 3 && U_ICU_VERSION_MINOR_NUM < 4))
00145     UErrorCode errorCode=U_ZERO_ERROR;
00146     u_init(&errorCode);
00147     if(U_FAILURE(errorCode)) {
00148         XMLPlatformUtils::panic(PanicHandler::Panic_NoTransService);
00149     }
00150 #endif
00151 
00152 #if !defined(XML_OS390) && !defined(XML_AS400) && !defined(XML_HPUX) && !defined(XML_PTX)
00153 #if (U_ICU_VERSION_MAJOR_NUM < 2)
00154     // Starting with ICU 2.0, ICU itself includes a static reference to the data
00155     // entrypoint symbol.
00156     //
00157     // ICU 1.8 (and previous) did not include a static reference, but would
00158     // dynamically load the data dll when it was first needed, however this dynamic
00159     // loading proved unreliable in some of the odd environments that Xerces needed
00160     // to run in.  Hence, the static reference.
00161 
00162     // Pass the location of the converter data to ICU. By doing so, we are
00163     // forcing the load of ICU converter data DLL, after the Xerces-C DLL is
00164     // loaded. This implies that Xerces-C, now has to explicitly link with the
00165     // ICU converter dll. However, the advantage is that we no longer depend
00166     // on the code which does demand dynamic loading of DLL's. The demand
00167     // loading is highly system dependent and was a constant source of support
00168     // calls.
00169     UErrorCode uerr = U_ZERO_ERROR;
00170     udata_setCommonData((void *) icudata_dat, &uerr);
00171 #endif
00172 #endif
00173 }
00174 
00175 ICUTransService::~ICUTransService()
00176 {
00177     /*
00178      * commented out the following clean up code
00179      * in case users use ICU outside of the parser
00180      * if we clean up here, users' code may crash
00181      *
00182     #if (U_ICU_VERSION_MAJOR_NUM >= 2)
00183         // release all lazily allocated data
00184         u_cleanup();
00185     #endif
00186     */
00187 }
00188 
00189 
00190 // ---------------------------------------------------------------------------
00191 //  ICUTransService: The virtual transcoding service API
00192 // ---------------------------------------------------------------------------
00193 int ICUTransService::compareIString(const   XMLCh* const    comp1
00194                                     , const XMLCh* const    comp2)
00195 {
00196     size_t  i = 0;
00197     size_t  j = 0;
00198 
00199     for(;;)
00200     {
00201         UChar32 ch1;
00202         UChar32 ch2;
00203 
00204         U16_NEXT_UNSAFE(comp1, i, ch1);
00205         U16_NEXT_UNSAFE(comp2, j, ch2);
00206 
00207         const UChar32   folded1 =
00208             u_foldCase(ch1, U_FOLD_CASE_DEFAULT);
00209 
00210         const UChar32   folded2 =
00211             u_foldCase(ch2, U_FOLD_CASE_DEFAULT);
00212 
00213         if (folded1 !=
00214             folded2)
00215         {
00216             return folded1 - folded2;
00217         }
00218         else if (ch1 == 0)
00219         {
00220             // If ch1 is 0, the ch2 must also be
00221             // 0.  Otherwise, the previous if
00222             // would have failed.
00223             break;
00224         }
00225     }
00226 
00227     return 0;
00228 }
00229 
00230 
00231 int ICUTransService::compareNIString(const  XMLCh* const    comp1
00232                                     , const XMLCh* const    comp2
00233                                     , const XMLSize_t       maxChars)
00234 {
00235     if (maxChars > 0)
00236     {
00237         // Note that this function has somewhat broken semantics, as it's
00238         // possible for two strings of different lengths to compare as equal
00239         // in a case-insensitive manner, since one character could be
00240         // represented as a surrogate pair.
00241         size_t  i = 0;
00242         size_t  j = 0;
00243 
00244         for(;;)
00245         {
00246             UChar32 ch1;
00247             UChar32 ch2;
00248 
00249             U16_NEXT_UNSAFE(comp1, i, ch1);
00250             U16_NEXT_UNSAFE(comp2, j, ch2);
00251 
00252             const UChar32   folded1 =
00253                 u_foldCase(ch1, U_FOLD_CASE_DEFAULT);
00254 
00255             const UChar32   folded2 =
00256                 u_foldCase(ch2, U_FOLD_CASE_DEFAULT);
00257 
00258             if (folded1 != folded2)
00259             {
00260                 return folded1 - folded2;
00261             }
00262             else if (i == maxChars)
00263             {
00264                 // If we're at the end of both strings, return 0.
00265                 // Otherwise, we've run out of characters in the
00266                 // left string, so return -1.
00267                 return j == maxChars ? 0 : -1;
00268             }
00269             else if (j == maxChars)
00270             {
00271                 // We've run out of characters in the right string,
00272                 // but not the left, so return 1.
00273                 return 1;
00274             }
00275         }
00276     }
00277 
00278     return 0;
00279 }
00280 
00281 
00282 const XMLCh* ICUTransService::getId() const
00283 {
00284     return gMyServiceId;
00285 }
00286 
00287 XMLLCPTranscoder* ICUTransService::makeNewLCPTranscoder(MemoryManager* manager)
00288 {
00289     //
00290     //  Try to create a default converter. If it fails, return a null
00291     //  pointer which will basically cause the system to give up because
00292     //  we really can't do anything without one.
00293     //
00294     UErrorCode uerr = U_ZERO_ERROR;
00295     UConverter* converter = ucnv_open(NULL, &uerr);
00296     if (!converter)
00297         return 0;
00298 
00299     // That went ok, so create an ICU LCP transcoder wrapper and return it
00300     return new (manager) ICULCPTranscoder(converter);
00301 }
00302 
00303 
00304 bool ICUTransService::supportsSrcOfs() const
00305 {
00306     // This implementation supports source offset information
00307     return true;
00308 }
00309 
00310 
00311 template <class FunctionType>
00312 static void doCaseConvert(XMLCh*          convertString,
00313                           FunctionType    caseFunction)
00314 {
00315     // Note the semantics of this function are broken, since it's
00316     // possible that changing the case of a string could increase
00317     // its length, but there's no way to handle such a situation.
00318     const XMLSize_t len = XMLString::stringLen(convertString);
00319 
00320     size_t  readPos = 0;
00321     size_t  writePos = 0;
00322 
00323     while(readPos < len)
00324     {
00325         UChar32     original;
00326 
00327         // Get the next Unicode code point.
00328         U16_NEXT_UNSAFE(convertString, readPos, original);
00329 
00330         // Convert the code point
00331         const UChar32   converted = caseFunction(original);
00332 
00333         // OK, now here's where it gets ugly.
00334         if (!U_IS_BMP(converted) && U_IS_BMP(original) &&
00335             readPos - writePos == 1)
00336         {
00337             // We do not have room to convert the
00338             // character without overwriting the next
00339             // character, so we will just stop.
00340             break;
00341         }
00342         else
00343         {
00344             U16_APPEND_UNSAFE(convertString, writePos, converted);
00345         }
00346     }
00347 
00348     convertString[writePos] = 0;
00349 }
00350 
00351 
00352 
00353 void ICUTransService::upperCase(XMLCh* const toUpperCase)
00354 {
00355     doCaseConvert(toUpperCase, u_toupper);
00356 }
00357 
00358 void ICUTransService::lowerCase(XMLCh* const toLowerCase)
00359 {
00360     doCaseConvert(toLowerCase, u_tolower);
00361 }
00362 
00363 
00364 
00365 // ---------------------------------------------------------------------------
00366 //  ICUTransService: The protected virtual transcoding service API
00367 // ---------------------------------------------------------------------------
00368 XMLTranscoder* ICUTransService::
00369 makeNewXMLTranscoder(const  XMLCh* const            encodingName
00370                     ,       XMLTransService::Codes& resValue
00371                     , const XMLSize_t               blockSize
00372                     ,       MemoryManager* const    manager)
00373 {
00374     //
00375     //  For encodings that end with "s390" we need to strip off the "s390"
00376     //  from the encoding name and add ",swaplfnl" to the encoding name
00377     //  that we pass into ICU on the ucnv_openU.
00378     //
00379     XMLCh* encodingNameToUse = (XMLCh*) encodingName;
00380     XMLCh* workBuffer = 0;
00381 
00382     if ( (XMLString::endsWith(encodingNameToUse, gs390Id)) ||
00383          (XMLString::endsWith(encodingNameToUse, gS390Id)) )
00384     {
00385        XMLSize_t workBufferSize = (XMLString::stringLen(encodingNameToUse) + XMLString::stringLen(gswaplfnlId) - XMLString::stringLen(gS390Id) + 1);
00386        workBuffer = (XMLCh*) manager->allocate(workBufferSize * sizeof(XMLCh));
00387        XMLSize_t moveSize = XMLString::stringLen(encodingNameToUse) - XMLString::stringLen(gS390Id);
00388        XMLString::moveChars(workBuffer, encodingNameToUse, moveSize);
00389        XMLString::moveChars((workBuffer + moveSize), gswaplfnlId, XMLString::stringLen(gswaplfnlId));
00390        encodingNameToUse = workBuffer;
00391     }
00392 
00393     //
00394     //  If UChar and XMLCh are not the same size, then we have premassage the
00395     //  encoding name into a UChar type string.
00396     //
00397     const UChar* actualName;
00398     UChar* tmpName = 0;
00399     if (sizeof(UChar) == sizeof(XMLCh))
00400     {
00401         actualName = (const UChar*)encodingNameToUse;
00402     }
00403     else
00404     {
00405         tmpName = convertToUChar(encodingNameToUse, 0, manager);
00406         actualName = tmpName;
00407     }
00408 
00409     ArrayJanitor<UChar> janTmp(tmpName, manager);
00410     ArrayJanitor<XMLCh> janTmp1(workBuffer, manager);
00411 
00412     UErrorCode uerr = U_ZERO_ERROR;
00413     UConverter* converter = ucnv_openU(actualName, &uerr);
00414     if (!converter)
00415     {
00416         resValue = XMLTransService::UnsupportedEncoding;
00417         return 0;
00418     }
00419 
00420     return new (manager) ICUTranscoder(encodingName, converter, blockSize, manager);
00421 }
00422 
00423 
00424 
00425 
00426 // ---------------------------------------------------------------------------
00427 //  ICUTranscoder: Constructors and Destructor
00428 // ---------------------------------------------------------------------------
00429 ICUTranscoder::ICUTranscoder(const  XMLCh* const        encodingName
00430                             ,       UConverter* const   toAdopt
00431                             , const XMLSize_t           blockSize
00432                             , MemoryManager* const      manager) :
00433 
00434     XMLTranscoder(encodingName, blockSize, manager)
00435     , fConverter(toAdopt)
00436     , fFixed(false)
00437     , fSrcOffsets(0)
00438 {
00439     // If there is a block size, then allocate our source offset array
00440     if (blockSize)
00441         fSrcOffsets = (XMLUInt32*) manager->allocate
00442         (
00443             blockSize * sizeof(XMLUInt32)
00444         );//new XMLUInt32[blockSize];
00445 
00446     // Remember if its a fixed size encoding
00447     fFixed = (ucnv_getMaxCharSize(fConverter) == ucnv_getMinCharSize(fConverter));
00448 }
00449 
00450 ICUTranscoder::~ICUTranscoder()
00451 {
00452     getMemoryManager()->deallocate(fSrcOffsets);//delete [] fSrcOffsets;
00453 
00454     // If there is a converter, ask ICU to clean it up
00455     if (fConverter)
00456     {
00457         // <TBD> Does this actually delete the structure???
00458         ucnv_close(fConverter);
00459         fConverter = 0;
00460     }
00461 }
00462 
00463 
00464 // ---------------------------------------------------------------------------
00465 //  ICUTranscoder: The virtual transcoder API
00466 // ---------------------------------------------------------------------------
00467 XMLSize_t
00468 ICUTranscoder::transcodeFrom(const  XMLByte* const          srcData
00469                             , const XMLSize_t               srcCount
00470                             ,       XMLCh* const            toFill
00471                             , const XMLSize_t               maxChars
00472                             ,       XMLSize_t&              bytesEaten
00473                             ,       unsigned char* const    charSizes)
00474 {
00475     // Set up pointers to the start and end of the source buffer
00476     const XMLByte*  startSrc = srcData;
00477     const XMLByte*  endSrc = srcData + srcCount;
00478 
00479     //
00480     //  And now do the target buffer. This works differently according to
00481     //  whether XMLCh and UChar are the same size or not.
00482     //
00483     UChar* startTarget;
00484     if (sizeof(XMLCh) == sizeof(UChar))
00485         startTarget = (UChar*)toFill;
00486      else
00487         startTarget = (UChar*) getMemoryManager()->allocate
00488         (
00489             maxChars * sizeof(UChar)
00490         );//new UChar[maxChars];
00491     UChar* orgTarget = startTarget;
00492 
00493     //
00494     //  Transcode the buffer.  Buffer overflow errors are normal, occuring
00495     //  when the raw input buffer holds more characters than will fit in
00496     //  the Unicode output buffer.
00497     //
00498     UErrorCode  err = U_ZERO_ERROR;
00499     ucnv_toUnicode
00500     (
00501         fConverter
00502         , &startTarget
00503         , startTarget + maxChars
00504         , (const char**)&startSrc
00505         , (const char*)endSrc
00506         , (fFixed ? 0 : (int32_t*)fSrcOffsets)
00507         , false
00508         , &err
00509     );
00510 
00511     if ((err != U_ZERO_ERROR) && (err != U_BUFFER_OVERFLOW_ERROR))
00512     {
00513         if (orgTarget != (UChar*)toFill)
00514             getMemoryManager()->deallocate(orgTarget);//delete [] orgTarget;
00515 
00516         if (fFixed)
00517         {
00518             XMLCh tmpBuf[17];
00519             XMLString::binToText((unsigned int)(*startTarget), tmpBuf, 16, 16, getMemoryManager());
00520             ThrowXMLwithMemMgr2
00521             (
00522                 TranscodingException
00523                 , XMLExcepts::Trans_BadSrcCP
00524                 , tmpBuf
00525                 , getEncodingName()
00526                 , getMemoryManager()
00527             );
00528         }
00529         else
00530         {
00531             ThrowXMLwithMemMgr(TranscodingException, XMLExcepts::Trans_BadSrcSeq, getMemoryManager());
00532         }
00533     }
00534 
00535     // Calculate the bytes eaten and store in caller's param
00536     bytesEaten = startSrc - srcData;
00537 
00538     // And the characters decoded
00539     const XMLSize_t charsDecoded = startTarget - orgTarget;
00540 
00541     //
00542     //  Translate the array of char offsets into an array of character
00543     //  sizes, which is what the transcoder interface semantics requires.
00544     //  If its fixed, then we can optimize it.
00545     //
00546     if (fFixed)
00547     {
00548         const unsigned char fillSize = (unsigned char)ucnv_getMaxCharSize(fConverter);
00549         memset(charSizes, fillSize, maxChars);
00550     }
00551      else
00552     {
00553         //
00554         //  We have to convert the series of offsets into a series of
00555         //  sizes. If just one char was decoded, then its the total bytes
00556         //  eaten. Otherwise, do a loop and subtract out each element from
00557         //  its previous element.
00558         //
00559         if (charsDecoded == 1)
00560         {
00561             charSizes[0] = (unsigned char)bytesEaten;
00562         }
00563          else
00564         {
00565             //  ICU does not return an extra element to allow us to figure
00566             //  out the last char size, so we have to compute it from the
00567             //  total bytes used.
00568             unsigned int index;
00569             for (index = 0; index < charsDecoded - 1; index++)
00570             {
00571                 charSizes[index] = (unsigned char)(fSrcOffsets[index + 1]
00572                                                     - fSrcOffsets[index]);
00573             }
00574             if( charsDecoded > 0 ) {
00575                 charSizes[charsDecoded - 1] = (unsigned char)(bytesEaten
00576                                               - fSrcOffsets[charsDecoded - 1]);
00577             }
00578         }
00579     }
00580 
00581     //
00582     //  If XMLCh and UChar are not the same size, then we need to copy over
00583     //  the temp buffer to the new one.
00584     //
00585     if (sizeof(UChar) != sizeof(XMLCh))
00586     {
00587         XMLCh* outPtr = toFill;
00588         startTarget = orgTarget;
00589         for (unsigned int index = 0; index < charsDecoded; index++)
00590             *outPtr++ = XMLCh(*startTarget++);
00591 
00592         // And delete the temp buffer
00593         getMemoryManager()->deallocate(orgTarget);//delete [] orgTarget;
00594     }
00595 
00596     // Return the chars we put into the target buffer
00597     return charsDecoded;
00598 }
00599 
00600 
00601 XMLSize_t
00602 ICUTranscoder::transcodeTo( const   XMLCh* const    srcData
00603                             , const XMLSize_t       srcCount
00604                             ,       XMLByte* const  toFill
00605                             , const XMLSize_t       maxBytes
00606                             ,       XMLSize_t&      charsEaten
00607                             , const UnRepOpts       options)
00608 {
00609     //
00610     //  Get a pointer to the buffer to transcode. If UChar and XMLCh are
00611     //  the same size here, then use the original. Else, create a temp
00612     //  one and put a janitor on it.
00613     //
00614     const UChar* srcPtr;
00615     UChar* tmpBufPtr = 0;
00616     if (sizeof(XMLCh) == sizeof(UChar))
00617     {
00618         srcPtr = (const UChar*)srcData;
00619     }
00620     else
00621     {
00622         tmpBufPtr = convertToUChar(srcData, srcCount, getMemoryManager());
00623         srcPtr = tmpBufPtr;
00624     }
00625     ArrayJanitor<UChar> janTmpBuf(tmpBufPtr, getMemoryManager());
00626 
00627     //
00628     //  Set the appropriate callback so that it will either fail or use
00629     //  the rep char. Remember the old one so we can put it back.
00630     //
00631     UErrorCode  err = U_ZERO_ERROR;
00632     UConverterFromUCallback oldCB = NULL;
00633     #if (U_ICU_VERSION_MAJOR_NUM < 2)
00634     void* orgContent;
00635     #else
00636     const void* orgContent;
00637     #endif
00638     ucnv_setFromUCallBack
00639     (
00640         fConverter
00641         , (options == UnRep_Throw) ? UCNV_FROM_U_CALLBACK_STOP
00642                                    : UCNV_FROM_U_CALLBACK_SUBSTITUTE
00643         , NULL
00644         , &oldCB
00645         , &orgContent
00646         , &err
00647     );
00648 
00649     //
00650     //  Ok, lets transcode as many chars as we we can in one shot. The
00651     //  ICU API gives enough info not to have to do this one char by char.
00652     //
00653     XMLByte*        startTarget = toFill;
00654     const UChar*    startSrc = srcPtr;
00655     err = U_ZERO_ERROR;
00656     ucnv_fromUnicode
00657     (
00658         fConverter
00659         , (char**)&startTarget
00660         , (char*)(startTarget + maxBytes)
00661         , &startSrc
00662         , srcPtr + srcCount
00663         , 0
00664         , false
00665         , &err
00666     );
00667 
00668     // Rememember the status before we possibly overite the error code
00669     const bool res = (err == U_ZERO_ERROR);
00670 
00671     // Put the old handler back
00672     err = U_ZERO_ERROR;
00673     UConverterFromUCallback orgAction = NULL;
00674 
00675     ucnv_setFromUCallBack(fConverter, oldCB, NULL, &orgAction, &orgContent, &err);
00676 
00677     if (!res)
00678     {
00679         XMLCh tmpBuf[17];
00680         XMLString::binToText((unsigned int)*startSrc, tmpBuf, 16, 16, getMemoryManager());
00681         ThrowXMLwithMemMgr2
00682         (
00683             TranscodingException
00684             , XMLExcepts::Trans_Unrepresentable
00685             , tmpBuf
00686             , getEncodingName()
00687             , getMemoryManager()
00688         );
00689     }
00690 
00691     // Fill in the chars we ate from the input
00692     charsEaten = startSrc - srcPtr;
00693 
00694     // Return the chars we stored
00695     return startTarget - toFill;
00696 }
00697 
00698 
00699 bool ICUTranscoder::canTranscodeTo(const unsigned int toCheck)
00700 {
00701     //
00702     //  If the passed value is really a surrogate embedded together, then
00703     //  we need to break it out into its two chars. Else just one. While
00704     //  we are ate it, convert them to UChar format if required.
00705     //
00706     UChar           srcBuf[2];
00707     unsigned int    srcCount = 1;
00708     if (toCheck & 0xFFFF0000)
00709     {
00710         srcBuf[0] = UChar((toCheck >> 10) + 0xD800);
00711         srcBuf[1] = UChar(toCheck & 0x3FF) + 0xDC00;
00712         srcCount++;
00713     }
00714      else
00715     {
00716         srcBuf[0] = UChar(toCheck);
00717     }
00718 
00719     //
00720     //  Set the callback so that it will fail instead of using the rep char.
00721     //  Remember the old one so we can put it back.
00722     //
00723      UErrorCode  err = U_ZERO_ERROR;
00724      UConverterFromUCallback oldCB = NULL;
00725      #if (U_ICU_VERSION_MAJOR_NUM < 2)
00726      void* orgContent;
00727      #else
00728      const void* orgContent;
00729      #endif
00730 
00731      ucnv_setFromUCallBack
00732          (
00733          fConverter
00734          , UCNV_FROM_U_CALLBACK_STOP
00735          , NULL
00736          , &oldCB
00737          , &orgContent
00738          , &err
00739          );
00740 
00741     // Set upa temp buffer to format into. Make it more than big enough
00742     char            tmpBuf[64];
00743     char*           startTarget = tmpBuf;
00744     const UChar*    startSrc = srcBuf;
00745 
00746     err = U_ZERO_ERROR;
00747     ucnv_fromUnicode
00748     (
00749         fConverter
00750         , &startTarget
00751         , startTarget + 64
00752         , &startSrc
00753         , srcBuf + srcCount
00754         , 0
00755         , false
00756         , &err
00757     );
00758 
00759     // Save the result before we overight the error code
00760     const bool res = (err == U_ZERO_ERROR);
00761 
00762     // Put the old handler back
00763     err = U_ZERO_ERROR;
00764     UConverterFromUCallback orgAction = NULL;
00765 
00766     ucnv_setFromUCallBack(fConverter, oldCB, NULL, &orgAction, &orgContent, &err);
00767 
00768     return res;
00769 }
00770 
00771 
00772 
00773 // ---------------------------------------------------------------------------
00774 //  ICULCPTranscoder: Constructors and Destructor
00775 // ---------------------------------------------------------------------------
00776 ICULCPTranscoder::ICULCPTranscoder(UConverter* const toAdopt) :
00777 
00778     fConverter(toAdopt)
00779 {
00780 }
00781 
00782 ICULCPTranscoder::~ICULCPTranscoder()
00783 {
00784     // If there is a converter, ask ICU to clean it up
00785     if (fConverter)
00786     {
00787         // <TBD> Does this actually delete the structure???
00788         ucnv_close(fConverter);
00789         fConverter = 0;
00790     }
00791 }
00792 
00793 
00794 // ---------------------------------------------------------------------------
00795 //  ICULCPTranscoder: The virtual transcoder API
00796 // ---------------------------------------------------------------------------
00797 XMLSize_t ICULCPTranscoder::calcRequiredSize(const XMLCh* const srcText
00798                                                 , MemoryManager* const manager)
00799 {
00800     if (!srcText)
00801         return 0;
00802 
00803     //
00804     //  We do two different versions of this, according to whether XMLCh
00805     //  is the same size as UChar or not.
00806     //
00807     UErrorCode err = U_ZERO_ERROR;
00808     int32_t targetCap;
00809     if (sizeof(XMLCh) == sizeof(UChar))
00810     {
00811         // Use a faux scope to synchronize while we do this
00812         {
00813             XMLMutexLock lockConverter(&fMutex);
00814 
00815             targetCap = ucnv_fromUChars
00816             (
00817                 fConverter
00818                 , 0
00819                 , 0
00820                 , (const UChar*)srcText
00821                 , -1
00822                 , &err
00823             );
00824         }
00825     }
00826     else
00827     {
00828         // Copy the source to a local temp
00829         UChar* tmpBuf = convertToUChar(srcText, 0, manager);
00830         ArrayJanitor<UChar> janTmp(tmpBuf, manager);
00831 
00832         // Use a faux scope to synchronize while we do this
00833         {
00834             XMLMutexLock lockConverter(&fMutex);
00835 
00836             targetCap = ucnv_fromUChars
00837             (
00838                 fConverter
00839                 , 0
00840                 , 0
00841                 , tmpBuf
00842                 , -1
00843                 , &err
00844             );
00845         }
00846     }
00847 
00848     if (err != U_BUFFER_OVERFLOW_ERROR)
00849         return 0;
00850 
00851     return (XMLSize_t)targetCap;
00852 }
00853 
00854 XMLSize_t ICULCPTranscoder::calcRequiredSize(const char* const srcText
00855                                                 , MemoryManager* const /*manager*/)
00856 {
00857     if (!srcText)
00858         return 0;
00859 
00860     int32_t targetCap;
00861     UErrorCode err = U_ZERO_ERROR;
00862 
00863     // Use a faux scope to synchronize while we do this
00864     {
00865         XMLMutexLock lockConverter(&fMutex);
00866         targetCap = ucnv_toUChars
00867         (
00868             fConverter
00869             , 0
00870             , 0
00871             , srcText
00872             , (int32_t)strlen(srcText)
00873             , &err
00874         );
00875     }
00876 
00877     if (err != U_BUFFER_OVERFLOW_ERROR)
00878         return 0;
00879 
00880 #if (U_ICU_VERSION_MAJOR_NUM < 2)
00881     // Subtract one since it includes the terminator space
00882     return (XMLSize_t)(targetCap - 1);
00883 #else
00884     // Starting ICU 2.0, this is fixed and all ICU String functions have consistent NUL-termination behavior.
00885     // The returned length is always the number of output UChar's, not counting an additional, terminating NUL.
00886     return (XMLSize_t)(targetCap);
00887 #endif
00888 }
00889 
00890 
00891 char* ICULCPTranscoder::transcode(const XMLCh* const toTranscode,
00892                                   MemoryManager* const manager)
00893 {
00894     char* retBuf = 0;
00895 
00896     // Check for a couple of special cases
00897     if (!toTranscode)
00898         return retBuf;
00899 
00900     if (!*toTranscode)
00901     {
00902         retBuf = (char*) manager->allocate(sizeof(char));//new char[1];
00903         retBuf[0] = 0;
00904         return retBuf;
00905     }
00906 
00907     //
00908     //  Get the length of the source string since we'll have to use it in
00909     //  a couple places below.
00910     //
00911     const XMLSize_t srcLen = XMLString::stringLen(toTranscode);
00912 
00913     //
00914     //  If XMLCh and UChar are not the same size, then we have to make a
00915     //  temp copy of the text to pass to ICU.
00916     //
00917     const UChar* actualSrc;
00918     UChar* ncActual = 0;
00919     if (sizeof(XMLCh) == sizeof(UChar))
00920     {
00921         actualSrc = (const UChar*)toTranscode;
00922     }
00923      else
00924     {
00925         // Allocate a non-const temp buf, but store it also in the actual
00926         ncActual = convertToUChar(toTranscode, 0, manager);
00927         actualSrc = ncActual;
00928     }
00929 
00930     // Insure that the temp buffer, if any, gets cleaned up via the nc pointer
00931     ArrayJanitor<UChar> janTmp(ncActual, manager);
00932 
00933     // Caculate a return buffer size not too big, but less likely to overflow
00934     int32_t targetLen = (int32_t)(srcLen * 1.25);
00935 
00936     // Allocate the return buffer
00937     retBuf = (char*) manager->allocate((targetLen + 1) * sizeof(char));//new char[targetLen + 1];
00938 
00939     //
00940     //  Lock now while we call the converter. Use a faux block to do the
00941     //  lock so that it unlocks immediately afterwards.
00942     //
00943     UErrorCode err = U_ZERO_ERROR;
00944     int32_t targetCap;
00945     {
00946         XMLMutexLock lockConverter(&fMutex);
00947 
00948         targetCap = ucnv_fromUChars
00949         (
00950             fConverter
00951             , retBuf
00952             , targetLen + 1
00953             , actualSrc
00954             , -1
00955             , &err
00956         );
00957     }
00958 
00959     // If targetLen is not enough then buffer overflow might occur
00960     if ((err == U_BUFFER_OVERFLOW_ERROR) || (err == U_STRING_NOT_TERMINATED_WARNING))
00961     {
00962         //
00963         //  Reset the error, delete the old buffer, allocate a new one,
00964         //  and try again.
00965         //
00966         err = U_ZERO_ERROR;
00967         manager->deallocate(retBuf);//delete [] retBuf;
00968         retBuf = (char*) manager->allocate((targetCap + 1) * sizeof(char));//new char[targetCap + 1];
00969 
00970         // Lock again before we retry
00971         XMLMutexLock lockConverter(&fMutex);
00972         targetCap = ucnv_fromUChars
00973         (
00974             fConverter
00975             , retBuf
00976             , targetCap + 1
00977             , actualSrc
00978             , -1
00979             , &err
00980         );
00981     }
00982 
00983     if (U_FAILURE(err))
00984     {
00985         manager->deallocate(retBuf);//delete [] retBuf;
00986         return 0;
00987     }
00988 
00989     return retBuf;
00990 }
00991 
00992 XMLCh* ICULCPTranscoder::transcode(const char* const toTranscode,
00993                                    MemoryManager* const manager)
00994 {
00995     // Watch for a few pyscho corner cases
00996     if (!toTranscode)
00997         return 0;
00998 
00999     if (!*toTranscode)
01000     {
01001         XMLCh* retVal = (XMLCh*) manager->allocate(sizeof(XMLCh));//new XMLCh[1];
01002         retVal[0] = 0;
01003         return retVal;
01004     }
01005 
01006     //
01007     //  Get the length of the string to transcode. The Unicode string will
01008     //  almost always be no more chars than were in the source, so this is
01009     //  the best guess as to the storage needed.
01010     //
01011     const int32_t srcLen = (int32_t)strlen(toTranscode);
01012 
01013     // We need a target buffer of UChars to fill in
01014     UChar* targetBuf = 0;
01015 
01016     // Now lock while we do these calculations
01017     UErrorCode err = U_ZERO_ERROR;
01018     int32_t targetCap;
01019     {
01020         XMLMutexLock lockConverter(&fMutex);
01021 
01022         //
01023         //  Here we don't know what the target length will be so use 0 and
01024         //  expect an U_BUFFER_OVERFLOW_ERROR in which case it'd get resolved
01025         //  by the correct capacity value.
01026         //
01027         targetCap = ucnv_toUChars
01028         (
01029             fConverter
01030             , 0
01031             , 0
01032             , toTranscode
01033             , srcLen
01034             , &err
01035         );
01036 
01037         if (err != U_BUFFER_OVERFLOW_ERROR)
01038             return 0;
01039 
01040         err = U_ZERO_ERROR;
01041         targetBuf = (UChar*) manager->allocate((targetCap+1) * sizeof(UChar));//new UChar[targetCap + 1];
01042         ucnv_toUChars
01043         (
01044             fConverter
01045             , targetBuf
01046             , targetCap + 1
01047             , toTranscode
01048             , srcLen
01049             , &err
01050         );
01051     }
01052 
01053     if (U_FAILURE(err))
01054     {
01055         // Clean up if we got anything allocated
01056         manager->deallocate(targetBuf);//delete [] targetBuf;
01057         return 0;
01058     }
01059 
01060     // Cap it off to make sure
01061     targetBuf[targetCap] = 0;
01062 
01063     //
01064     //  If XMLCh and UChar are the same size, then we can return retVal
01065     //  as is. Else, we have to allocate another buffer and copy the data
01066     //  over to it.
01067     //
01068     XMLCh* actualRet;
01069     if (sizeof(XMLCh) == sizeof(UChar))
01070     {
01071         actualRet = (XMLCh*)targetBuf;
01072     }
01073      else
01074     {
01075         actualRet = convertToXMLCh(targetBuf, manager);
01076         manager->deallocate(targetBuf);//delete [] targetBuf;
01077     }
01078     return actualRet;
01079 }
01080 
01081 
01082 bool ICULCPTranscoder::transcode(const  char* const     toTranscode
01083                                 ,       XMLCh* const    toFill
01084                                 , const XMLSize_t       maxChars
01085                                 , MemoryManager* const  manager)
01086 {
01087     // Check for a couple of psycho corner cases
01088     if (!toTranscode || !maxChars)
01089     {
01090         toFill[0] = 0;
01091         return true;
01092     }
01093 
01094     if (!*toTranscode)
01095     {
01096         toFill[0] = 0;
01097         return true;
01098     }
01099 
01100     // We'll need this in a couple of places below
01101     const XMLSize_t srcLen = strlen(toTranscode);
01102 
01103     //
01104     //  Set up the target buffer. If XMLCh and UChar are not the same size
01105     //  then we have to use a temp buffer and convert over.
01106     //
01107     UChar* targetBuf;
01108     if (sizeof(XMLCh) == sizeof(UChar))
01109         targetBuf = (UChar*)toFill;
01110     else
01111         targetBuf = (UChar*) manager->allocate
01112         (
01113             (maxChars + 1) * sizeof(UChar)
01114         );//new UChar[maxChars + 1];
01115 
01116     //
01117     //  Use a faux block to enforce a lock on the converter, which will
01118     //  unlock immediately after its completed.
01119     //
01120     UErrorCode err = U_ZERO_ERROR;
01121     {
01122         XMLMutexLock lockConverter(&fMutex);
01123         ucnv_toUChars
01124         (
01125             fConverter
01126             , targetBuf
01127             , (int32_t)maxChars + 1
01128             , toTranscode
01129             , (int32_t)srcLen
01130             , &err
01131         );
01132     }
01133 
01134     if (U_FAILURE(err))
01135     {
01136         if (targetBuf != (UChar*)toFill)
01137             manager->deallocate(targetBuf);//delete [] targetBuf;
01138         return false;
01139     }
01140 
01141     // If the sizes are not the same, then copy the data over
01142     if (sizeof(XMLCh) != sizeof(UChar))
01143     {
01144         UChar* srcPtr = targetBuf;
01145         XMLCh* outPtr = toFill;
01146         while (*srcPtr)
01147             *outPtr++ = XMLCh(*srcPtr++);
01148         *outPtr = 0;
01149 
01150         // And delete the temp buffer
01151         manager->deallocate(targetBuf);//delete [] targetBuf;
01152     }
01153 
01154     return true;
01155 }
01156 
01157 
01158 bool ICULCPTranscoder::transcode(   const   XMLCh* const    toTranscode
01159                                     ,       char* const     toFill
01160                                     , const XMLSize_t       maxChars
01161                                     , MemoryManager* const  manager)
01162 {
01163     // Watch for a few psycho corner cases
01164     if (!toTranscode || !maxChars)
01165     {
01166         toFill[0] = 0;
01167         return true;
01168     }
01169 
01170     if (!*toTranscode)
01171     {
01172         toFill[0] = 0;
01173         return true;
01174     }
01175 
01176     //
01177     //  If XMLCh and UChar are not the same size, then we have to make a
01178     //  temp copy of the text to pass to ICU.
01179     //
01180     const UChar* actualSrc;
01181     UChar* ncActual = 0;
01182     if (sizeof(XMLCh) == sizeof(UChar))
01183     {
01184         actualSrc = (const UChar*)toTranscode;
01185     }
01186      else
01187     {
01188         // Allocate a non-const temp buf, but store it also in the actual
01189         ncActual = convertToUChar(toTranscode, 0, manager);
01190         actualSrc = ncActual;
01191     }
01192 
01193     // Insure that the temp buffer, if any, gets cleaned up via the nc pointer
01194     ArrayJanitor<UChar> janTmp(ncActual, manager);
01195 
01196     //
01197     //  Use a faux block to enforce a lock on the converter while we do this.
01198     //  It will be released immediately after its done.
01199     //
01200     UErrorCode err = U_ZERO_ERROR;
01201     int32_t targetCap;
01202     {
01203         XMLMutexLock lockConverter(&fMutex);
01204         targetCap = ucnv_fromUChars
01205         (
01206             fConverter
01207             , toFill
01208             , (int32_t)maxChars
01209             , actualSrc
01210             , -1
01211             , &err
01212         );
01213     }
01214 
01215     if (U_FAILURE(err))
01216         return false;
01217 
01218     toFill[targetCap] = 0;
01219     return true;
01220 }
01221 
01222 XERCES_CPP_NAMESPACE_END