GME  13
XMLUCS4Transcoder.cpp
Go to the documentation of this file.
00001 /*
00002  * Licensed to the Apache Software Foundation (ASF) under one or more
00003  * contributor license agreements.  See the NOTICE file distributed with
00004  * this work for additional information regarding copyright ownership.
00005  * The ASF licenses this file to You under the Apache License, Version 2.0
00006  * (the "License"); you may not use this file except in compliance with
00007  * the License.  You may obtain a copy of the License at
00008  * 
00009  *      http://www.apache.org/licenses/LICENSE-2.0
00010  * 
00011  * Unless required by applicable law or agreed to in writing, software
00012  * distributed under the License is distributed on an "AS IS" BASIS,
00013  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00014  * See the License for the specific language governing permissions and
00015  * limitations under the License.
00016  */
00017 
00018 
00019 // ---------------------------------------------------------------------------
00020 //  Includes
00021 // ---------------------------------------------------------------------------
00022 #include <xercesc/util/BitOps.hpp>
00023 #include <xercesc/util/XMLUCS4Transcoder.hpp>
00024 #include <xercesc/util/TranscodingException.hpp>
00025 #include <string.h>
00026 
00027 XERCES_CPP_NAMESPACE_BEGIN
00028 
00029 // ---------------------------------------------------------------------------
00030 //  XMLUCS4Transcoder: Constructors and Destructor
00031 // ---------------------------------------------------------------------------
00032 XMLUCS4Transcoder::XMLUCS4Transcoder(const  XMLCh* const    encodingName
00033                                     , const XMLSize_t       blockSize
00034                                     , const bool            swapped
00035                                     , MemoryManager* const manager) :
00036 
00037     XMLTranscoder(encodingName, blockSize, manager)
00038     , fSwapped(swapped)
00039 {
00040 }
00041 
00042 
00043 XMLUCS4Transcoder::~XMLUCS4Transcoder()
00044 {
00045 }
00046 
00047 
00048 // ---------------------------------------------------------------------------
00049 //  XMLUCS4Transcoder: Implementation of the transcoder API
00050 // ---------------------------------------------------------------------------
00051 XMLSize_t
00052 XMLUCS4Transcoder::transcodeFrom(const  XMLByte* const          srcData
00053                                 , const XMLSize_t               srcCount
00054                                 ,       XMLCh* const            toFill
00055                                 , const XMLSize_t               maxChars
00056                                 ,       XMLSize_t&              bytesEaten
00057                                 ,       unsigned char* const    charSizes)
00058 {
00059     //
00060     //  Get pointers to the start and end of the source buffer in terms of
00061     //  UCS-4 characters.
00062     //
00063     const UCS4Ch*   srcPtr = (const UCS4Ch*)srcData;
00064     const UCS4Ch*   srcEnd = srcPtr + (srcCount / sizeof(UCS4Ch));
00065 
00066     //
00067     //  Get pointers to the start and end of the target buffer, which is
00068     //  in terms of the XMLCh chars we output.
00069     //
00070     XMLCh*  outPtr = toFill;
00071     XMLCh*  outEnd = toFill + maxChars;
00072 
00073     //
00074     //  And get a pointer into the char sizes buffer. We will run this
00075     //  up as we put chars into the output buffer.
00076     //
00077     unsigned char* sizePtr = charSizes;
00078 
00079     //
00080     //  Now process chars until we either use up all our source or all of
00081     //  our output space.
00082     //
00083     while ((outPtr < outEnd) && (srcPtr < srcEnd))
00084     {
00085         //
00086         //  Get the next UCS char out of the buffer. Don't bump the ptr
00087         //  yet since we might not have enough storage for it in the target
00088         //  (if its causes a surrogate pair to be created.
00089         //
00090         UCS4Ch nextVal = *srcPtr;
00091 
00092         // If it needs to be swapped, then do it
00093         if (fSwapped)
00094             nextVal = BitOps::swapBytes(nextVal);
00095 
00096         // Handle a surrogate pair if needed
00097         if (nextVal & 0xFFFF0000)
00098         {
00099             //
00100             //  If we don't have room for both of the chars, then we
00101             //  bail out now.
00102             //
00103             if (outPtr + 1 == outEnd)
00104                 break;
00105 
00106             const XMLInt32 LEAD_OFFSET = 0xD800 - (0x10000 >> 10);
00107                 const XMLCh ch1 = XMLCh(LEAD_OFFSET + (nextVal >> 10));
00108                 const XMLCh ch2 = XMLCh(0xDC00 + (nextVal & 0x3FF));
00109 
00110             //
00111             //  We have room so store them both. But note that the
00112             //  second one took up no source bytes!
00113             //
00114             *sizePtr++ = sizeof(UCS4Ch);
00115             *outPtr++ = ch1;
00116             *sizePtr++ = 0;
00117             *outPtr++ = ch2;
00118         }
00119          else
00120         {
00121             //
00122             //  No surrogate, so just store it and bump the count of chars
00123             //  read. Update the char sizes buffer for this char's entry.
00124             //
00125             *sizePtr++ = sizeof(UCS4Ch);
00126             *outPtr++ = XMLCh(nextVal);
00127         }
00128 
00129         // Indicate that we ate another UCS char's worth of bytes
00130         srcPtr++;
00131     }
00132 
00133     // Set the bytes eaten parameter
00134     bytesEaten = ((const XMLByte*)srcPtr) - srcData;
00135 
00136     // And return the chars written into the output buffer
00137     return outPtr - toFill;
00138 }
00139 
00140 
00141 XMLSize_t
00142 XMLUCS4Transcoder::transcodeTo( const   XMLCh* const    srcData
00143                                 , const XMLSize_t       srcCount
00144                                 ,       XMLByte* const  toFill
00145                                 , const XMLSize_t       maxBytes
00146                                 ,       XMLSize_t&      charsEaten
00147                                 , const UnRepOpts)
00148 {
00149     //
00150     //  Get pointers to the start and end of the source buffer, which
00151     //  is in terms of XMLCh chars.
00152     //
00153     const XMLCh*  srcPtr = srcData;
00154     const XMLCh*  srcEnd = srcData + srcCount;
00155 
00156     //
00157     //  Get pointers to the start and end of the target buffer, in terms
00158     //  of UCS-4 chars.
00159     //
00160     UCS4Ch*   outPtr = (UCS4Ch*)toFill;
00161     UCS4Ch*   outEnd = outPtr + (maxBytes / sizeof(UCS4Ch));
00162 
00163     //
00164     //  Now loop until we either run out of source characters or we
00165     //  fill up our output buffer.
00166     //
00167     XMLCh trailCh;
00168     while ((outPtr < outEnd) && (srcPtr < srcEnd))
00169     {
00170         //
00171         //  Get out an XMLCh char from the source. Don't bump up the
00172         //  pointer yet, since it might be a leading for which we don't
00173         //  have the trailing.
00174         //
00175         const XMLCh curCh = *srcPtr;
00176 
00177         //
00178         //  If its a leading char of a surrogate pair handle it one way,
00179         //  else just cast it over into the target.
00180         //
00181         if ((curCh >= 0xD800) && (curCh <= 0xDBFF))
00182         {
00183             //
00184             //  Ok, we have to have another source char available or we
00185             //  just give up without eating the leading char.
00186             //
00187             if (srcPtr + 1 == srcEnd)
00188                 break;
00189 
00190             //
00191             //  We have the trailing char, so eat the first char and the
00192             //  trailing char from the source.
00193             //
00194             srcPtr++;
00195             trailCh = *srcPtr++;
00196 
00197             //
00198             //  Then make sure its a legal trailing char. If not, throw
00199             //  an exception.
00200             //
00201             if ( !( (trailCh >= 0xDC00) && (trailCh <= 0xDFFF) ) )
00202                 ThrowXMLwithMemMgr(TranscodingException, XMLExcepts::Trans_BadTrailingSurrogate, getMemoryManager());
00203 
00204             // And now combine the two into a single output char
00205             const XMLInt32 SURROGATE_OFFSET = 0x10000 - (0xD800 << 10) - 0xDC00;
00206             *outPtr++ = (curCh << 10) + trailCh + SURROGATE_OFFSET;
00207         }
00208          else
00209         {
00210             //
00211             //  Its just a char, so we can take it as is. If we need to
00212             //  swap it, then swap it. Because of flakey compilers, use
00213             //  a temp first.
00214             //
00215             const UCS4Ch tmpCh = UCS4Ch(curCh);
00216             if (fSwapped)
00217                 *outPtr++ = BitOps::swapBytes(tmpCh);
00218             else
00219                 *outPtr++ = tmpCh;
00220 
00221             // Bump the source pointer
00222             srcPtr++;
00223         }
00224     }
00225 
00226     // Set the chars we ate from the source
00227     charsEaten = srcPtr - srcData;
00228 
00229     // Return the bytes we wrote to the output
00230     return ((XMLByte*)outPtr) - toFill;
00231 }
00232 
00233 
00234 bool XMLUCS4Transcoder::canTranscodeTo(const unsigned int)
00235 {
00236     // We can handle anything
00237     return true;
00238 }
00239 
00240 XERCES_CPP_NAMESPACE_END