GME  13
XMLUTF8Transcoder.cpp
Go to the documentation of this file.
00001 /*
00002  * Licensed to the Apache Software Foundation (ASF) under one or more
00003  * contributor license agreements.  See the NOTICE file distributed with
00004  * this work for additional information regarding copyright ownership.
00005  * The ASF licenses this file to You under the Apache License, Version 2.0
00006  * (the "License"); you may not use this file except in compliance with
00007  * the License.  You may obtain a copy of the License at
00008  * 
00009  *      http://www.apache.org/licenses/LICENSE-2.0
00010  * 
00011  * Unless required by applicable law or agreed to in writing, software
00012  * distributed under the License is distributed on an "AS IS" BASIS,
00013  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00014  * See the License for the specific language governing permissions and
00015  * limitations under the License.
00016  */
00017 
00022 // ---------------------------------------------------------------------------
00023 //  Includes
00024 // ---------------------------------------------------------------------------
00025 #include <xercesc/util/TranscodingException.hpp>
00026 #include <xercesc/util/XMLString.hpp>
00027 #include <xercesc/util/XMLUniDefs.hpp>
00028 #include <xercesc/util/XMLUTF8Transcoder.hpp>
00029 
00030 XERCES_CPP_NAMESPACE_BEGIN
00031 
00032 // ---------------------------------------------------------------------------
00033 //  Local static data
00034 //
00035 //  gUTFBytes
00036 //      A list of counts of trailing bytes for each initial byte in the input.
00037 //
00038 //  gUTFByteIndicator
00039 //      For a UTF8 sequence of n bytes, n>=2, the first byte of the
00040 //      sequence must contain n 1's followed by precisely 1 0 with the
00041 //      rest of the byte containing arbitrary bits.  This array stores
00042 //      the required bit pattern for validity checking.
00043 //  gUTFByteIndicatorTest
00044 //      When bitwise and'd with the observed value, if the observed
00045 //      value is correct then a result matching gUTFByteIndicator will
00046 //      be produced.
00047 //
00048 //  gUTFOffsets
00049 //      A list of values to offset each result char type, according to how
00050 //      many source bytes went into making it.
00051 //
00052 //  gFirstByteMark
00053 //      A list of values to mask onto the first byte of an encoded sequence,
00054 //      indexed by the number of bytes used to create the sequence.
00055 // ---------------------------------------------------------------------------
00056 static const XMLByte gUTFBytes[256] =
00057 {
00058         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
00059     ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
00060     ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
00061     ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
00062     ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
00063     ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
00064     ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
00065     ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
00066     ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
00067     ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
00068     ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
00069     ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
00070     ,   0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
00071     ,   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
00072     ,   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
00073     ,   3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
00074 };
00075 
00076 static const XMLByte gUTFByteIndicator[6] =
00077 {
00078     0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
00079 };
00080 static const XMLByte gUTFByteIndicatorTest[6] =
00081 {
00082     0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE
00083 };
00084 
00085 static const XMLUInt32 gUTFOffsets[6] =
00086 {
00087     0, 0x3080, 0xE2080, 0x3C82080, 0xFA082080, 0x82082080
00088 };
00089 
00090 static const XMLByte gFirstByteMark[7] =
00091 {
00092     0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
00093 };
00094 
00095 
00096 
00097 // ---------------------------------------------------------------------------
00098 //  XMLUTF8Transcoder: Constructors and Destructor
00099 // ---------------------------------------------------------------------------
00100 XMLUTF8Transcoder::XMLUTF8Transcoder(const  XMLCh* const    encodingName
00101                                     , const XMLSize_t       blockSize
00102                                     , MemoryManager* const  manager)
00103 :XMLTranscoder(encodingName, blockSize, manager)
00104 {
00105 }
00106 
00107 XMLUTF8Transcoder::~XMLUTF8Transcoder()
00108 {
00109 }
00110 
00111 
00112 // ---------------------------------------------------------------------------
00113 //  XMLUTF8Transcoder: Implementation of the transcoder API
00114 // ---------------------------------------------------------------------------
00115 XMLSize_t
00116 XMLUTF8Transcoder::transcodeFrom(const  XMLByte* const          srcData
00117                                 , const XMLSize_t               srcCount
00118                                 ,       XMLCh* const            toFill
00119                                 , const XMLSize_t               maxChars
00120                                 ,       XMLSize_t&              bytesEaten
00121                                 ,       unsigned char* const    charSizes)
00122 {
00123     // Watch for pathological scenario. Shouldn't happen, but...
00124     if (!srcCount || !maxChars)
00125         return 0;
00126 
00127     //
00128     //  Get pointers to our start and end points of the input and output
00129     //  buffers.
00130     //
00131     const XMLByte*  srcPtr = srcData;
00132     const XMLByte*  srcEnd = srcPtr + srcCount;
00133     XMLCh*          outPtr = toFill;
00134     XMLCh*          outEnd = outPtr + maxChars;
00135     unsigned char*  sizePtr = charSizes;
00136 
00137 
00138 
00139     //
00140     //  We now loop until we either run out of input data, or room to store
00141     //  output chars.
00142     //
00143     while ((srcPtr < srcEnd) && (outPtr < outEnd))
00144     {
00145         // Special-case ASCII, which is a leading byte value of <= 127
00146         if (*srcPtr <= 127)
00147         {
00148             // Handle ASCII in groups instead of single character at a time.
00149             const XMLByte* srcPtr_save = srcPtr;
00150             const XMLSize_t chunkSize = (srcEnd-srcPtr)<(outEnd-outPtr)?(srcEnd-srcPtr):(outEnd-outPtr);
00151             for(XMLSize_t i=0;i<chunkSize && *srcPtr <= 127;++i)
00152                 *outPtr++ = XMLCh(*srcPtr++);
00153             memset(sizePtr,1,srcPtr - srcPtr_save);
00154             sizePtr += srcPtr - srcPtr_save;
00155             if (srcPtr == srcEnd || outPtr == outEnd)
00156                 break;
00157         }
00158 
00159         // See how many trailing src bytes this sequence is going to require
00160         const unsigned int trailingBytes = gUTFBytes[*srcPtr];
00161 
00162         //
00163         //  If there are not enough source bytes to do this one, then we
00164         //  are done. Note that we done >= here because we are implicitly
00165         //  counting the 1 byte we get no matter what.
00166         //
00167         //  If we break out here, then there is nothing to undo since we
00168         //  haven't updated any pointers yet.
00169         //
00170         if (srcPtr + trailingBytes >= srcEnd)
00171             break;
00172 
00173         // Looks ok, so lets build up the value
00174         // or at least let's try to do so--remembering that
00175         // we cannot assume the encoding to be valid:
00176 
00177         // first, test first byte
00178         if((gUTFByteIndicatorTest[trailingBytes] & *srcPtr) != gUTFByteIndicator[trailingBytes]) {
00179             char pos[2] = {(char)0x31, 0}; 
00180             char len[2] = {(char)(trailingBytes+0x31), 0};
00181             char byte[2] = {*srcPtr,0};
00182             ThrowXMLwithMemMgr3(UTFDataFormatException, XMLExcepts::UTF8_FormatError, pos, byte, len, getMemoryManager());
00183         }
00184 
00185         /***
00186          * http://www.unicode.org/reports/tr27/
00187          *
00188          * Table 3.1B. lists all of the byte sequences that are legal in UTF-8. 
00189          * A range of byte values such as A0..BF indicates that any byte from A0 to BF (inclusive) 
00190          * is legal in that position. 
00191          * Any byte value outside of the ranges listed is illegal. 
00192          * For example, 
00193          * the byte sequence <C0 AF> is illegal  since C0 is not legal in the 1st Byte column. 
00194          * The byte sequence <E0 9F 80> is illegal since in the row 
00195          *    where E0 is legal as a first byte, 
00196          *    9F is not legal as a second byte. 
00197          * The byte sequence <F4 80 83 92> is legal, since every byte in that sequence matches 
00198          * a byte range in a row of the table (the last row). 
00199          *
00200          *
00201          * Table 3.1B. Legal UTF-8 Byte Sequences  
00202          * Code Points              1st Byte    2nd Byte    3rd Byte    4th Byte 
00203          * =========================================================================
00204          * U+0000..U+007F            00..7F       
00205          * -------------------------------------------------------------------------
00206          * U+0080..U+07FF            C2..DF      80..BF      
00207          *
00208          * -------------------------------------------------------------------------
00209          * U+0800..U+0FFF            E0          A0..BF     80..BF   
00210          *                                       -- 
00211          *                          
00212          * U+1000..U+FFFF            E1..EF      80..BF     80..BF    
00213          *
00214          * --------------------------------------------------------------------------
00215          * U+10000..U+3FFFF          F0          90..BF     80..BF       80..BF 
00216          *                                       --
00217          * U+40000..U+FFFFF          F1..F3      80..BF     80..BF       80..BF 
00218          * U+100000..U+10FFFF        F4          80..8F     80..BF       80..BF 
00219          *                                           --
00220          * ==========================================================================
00221          *
00222          *  Cases where a trailing byte range is not 80..BF are underlined in the table to 
00223          *  draw attention to them. These occur only in the second byte of a sequence.
00224          *
00225          ***/
00226 
00227         XMLUInt32 tmpVal = 0;
00228 
00229         switch(trailingBytes)
00230         {
00231             case 1 :
00232                 // UTF-8:   [110y yyyy] [10xx xxxx]
00233                 // Unicode: [0000 0yyy] [yyxx xxxx]
00234                 //
00235                 // 0xC0, 0xC1 has been filtered out             
00236                 checkTrailingBytes(*(srcPtr+1), 1, 1);
00237 
00238                 tmpVal = *srcPtr++;
00239                 tmpVal <<= 6;
00240                 tmpVal += *srcPtr++;
00241 
00242                 break;
00243             case 2 :
00244                 // UTF-8:   [1110 zzzz] [10yy yyyy] [10xx xxxx]
00245                 // Unicode: [zzzz yyyy] [yyxx xxxx]
00246                 //
00247                 if (( *srcPtr == 0xE0) && ( *(srcPtr+1) < 0xA0)) 
00248                 {
00249                     char byte0[2] = {*srcPtr    ,0};
00250                     char byte1[2] = {*(srcPtr+1),0};
00251 
00252                     ThrowXMLwithMemMgr2(UTFDataFormatException
00253                                       , XMLExcepts::UTF8_Invalid_3BytesSeq
00254                                       , byte0
00255                                       , byte1
00256                                       , getMemoryManager());
00257                 }
00258 
00259                 checkTrailingBytes(*(srcPtr+1), 2, 1);
00260                 checkTrailingBytes(*(srcPtr+2), 2, 2);
00261 
00262                 //
00263                 // D36 (a) UTF-8 is the Unicode Transformation Format that serializes 
00264                 //         a Unicode code point as a sequence of one to four bytes, 
00265                 //         as specified in Table 3.1, UTF-8 Bit Distribution.
00266                 //     (b) An illegal UTF-8 code unit sequence is any byte sequence that 
00267                 //         does not match the patterns listed in Table 3.1B, Legal UTF-8 
00268                 //         Byte Sequences.
00269                 //     (c) An irregular UTF-8 code unit sequence is a six-byte sequence 
00270                 //         where the first three bytes correspond to a high surrogate, 
00271                 //         and the next three bytes correspond to a low surrogate. 
00272                 //         As a consequence of C12, these irregular UTF-8 sequences shall 
00273                 //         not be generated by a conformant process. 
00274                 //
00275                 //irregular three bytes sequence
00276                 // that is zzzzyy matches leading surrogate tag 110110 or 
00277                 //                       trailing surrogate tag 110111
00278                 // *srcPtr=1110 1101 
00279                 // *(srcPtr+1)=1010 yyyy or 
00280                 // *(srcPtr+1)=1011 yyyy
00281                 //
00282                 // 0xED 1110 1101
00283                 // 0xA0 1010 0000
00284 
00285                 if ((*srcPtr == 0xED) && (*(srcPtr+1) >= 0xA0))
00286                 {
00287                     char byte0[2] = {*srcPtr,    0};
00288                     char byte1[2] = {*(srcPtr+1),0};
00289 
00290                      ThrowXMLwithMemMgr2(UTFDataFormatException
00291                               , XMLExcepts::UTF8_Irregular_3BytesSeq
00292                               , byte0
00293                               , byte1
00294                               , getMemoryManager());
00295                 }
00296 
00297                 tmpVal = *srcPtr++;
00298                 tmpVal <<= 6;
00299                 tmpVal += *srcPtr++;
00300                 tmpVal <<= 6;
00301                 tmpVal += *srcPtr++;
00302 
00303                 break;
00304             case 3 : 
00305                 // UTF-8:   [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
00306                 // Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
00307                 //          [1101 11yy] [yyxx xxxx] (low surrogate)
00308                 //          * uuuuu = wwww + 1
00309                 //
00310                 if (((*srcPtr == 0xF0) && (*(srcPtr+1) < 0x90)) ||
00311                     ((*srcPtr == 0xF4) && (*(srcPtr+1) > 0x8F))  )
00312                 {
00313                     char byte0[2] = {*srcPtr    ,0};
00314                     char byte1[2] = {*(srcPtr+1),0};
00315 
00316                     ThrowXMLwithMemMgr2(UTFDataFormatException
00317                                       , XMLExcepts::UTF8_Invalid_4BytesSeq
00318                                       , byte0
00319                                       , byte1
00320                                       , getMemoryManager());
00321                 }
00322 
00323                 checkTrailingBytes(*(srcPtr+1), 3, 1);
00324                 checkTrailingBytes(*(srcPtr+2), 3, 2);
00325                 checkTrailingBytes(*(srcPtr+3), 3, 3);
00326                 
00327                 tmpVal = *srcPtr++;
00328                 tmpVal <<= 6;
00329                 tmpVal += *srcPtr++;
00330                 tmpVal <<= 6;
00331                 tmpVal += *srcPtr++;
00332                 tmpVal <<= 6;
00333                 tmpVal += *srcPtr++;
00334 
00335                 break;
00336             default: // trailingBytes > 3
00337 
00338                 /***
00339                  * The definition of UTF-8 in Annex D of ISO/IEC 10646-1:2000 also allows 
00340                  * for the use of five- and six-byte sequences to encode characters that 
00341                  * are outside the range of the Unicode character set; those five- and 
00342                  * six-byte sequences are illegal for the use of UTF-8 as a transformation 
00343                  * of Unicode characters. ISO/IEC 10646 does not allow mapping of unpaired 
00344                  * surrogates, nor U+FFFE and U+FFFF (but it does allow other noncharacters).
00345                  ***/
00346                 char len[2]  = {(char)(trailingBytes+0x31), 0};
00347                 char byte[2] = {*srcPtr,0};
00348 
00349                 ThrowXMLwithMemMgr2(UTFDataFormatException
00350                                   , XMLExcepts::UTF8_Exceeds_BytesLimit
00351                                   , byte
00352                                   , len
00353                                   , getMemoryManager());
00354 
00355                 break;
00356         }
00357 
00358 
00359         // since trailingBytes comes from an array, this logic is redundant
00360         //  default :
00361         //      ThrowXMLwithMemMgr(TranscodingException, XMLExcepts::Trans_BadSrcSeq);
00362         //}
00363         tmpVal -= gUTFOffsets[trailingBytes];
00364 
00365         //
00366         //  If it will fit into a single char, then put it in. Otherwise
00367         //  encode it as a surrogate pair. If its not valid, use the
00368         //  replacement char.
00369         //
00370         if (!(tmpVal & 0xFFFF0000))
00371         {
00372             *sizePtr++ = trailingBytes + 1;
00373             *outPtr++ = XMLCh(tmpVal);
00374         }
00375          else if (tmpVal > 0x10FFFF)
00376         {
00377             //
00378             //  If we've gotten more than 32 chars so far, then just break
00379             //  out for now and lets process those. When we come back in
00380             //  here again, we'll get no chars and throw an exception. This
00381             //  way, the error will have a line and col number closer to
00382             //  the real problem area.
00383             //
00384             if ((outPtr - toFill) > 32)
00385                 break;
00386 
00387             ThrowXMLwithMemMgr(TranscodingException, XMLExcepts::Trans_BadSrcSeq, getMemoryManager());
00388         }
00389          else
00390         {
00391             //
00392             //  If we have enough room to store the leading and trailing
00393             //  chars, then lets do it. Else, pretend this one never
00394             //  happened, and leave it for the next time. Since we don't
00395             //  update the bytes read until the bottom of the loop, by
00396             //  breaking out here its like it never happened.
00397             //
00398             if (outPtr + 1 >= outEnd)
00399                 break;
00400 
00401             // Store the leading surrogate char
00402             tmpVal -= 0x10000;
00403             *sizePtr++ = trailingBytes + 1;
00404             *outPtr++ = XMLCh((tmpVal >> 10) + 0xD800);
00405 
00406             //
00407             //  And then the trailing char. This one accounts for no
00408             //  bytes eaten from the source, so set the char size for this
00409             //  one to be zero.
00410             //
00411             *sizePtr++ = 0;
00412             *outPtr++ = XMLCh((tmpVal & 0x3FF) + 0xDC00);
00413         }
00414     }
00415 
00416     // Update the bytes eaten
00417     bytesEaten = srcPtr - srcData;
00418 
00419     // Return the characters read
00420     return outPtr - toFill;
00421 }
00422 
00423 
00424 XMLSize_t
00425 XMLUTF8Transcoder::transcodeTo( const   XMLCh* const    srcData
00426                                 , const XMLSize_t       srcCount
00427                                 ,       XMLByte* const  toFill
00428                                 , const XMLSize_t       maxBytes
00429                                 ,       XMLSize_t&      charsEaten
00430                                 , const UnRepOpts       options)
00431 {
00432     // Watch for pathological scenario. Shouldn't happen, but...
00433     if (!srcCount || !maxBytes)
00434         return 0;
00435 
00436     //
00437     //  Get pointers to our start and end points of the input and output
00438     //  buffers.
00439     //
00440     const XMLCh*    srcPtr = srcData;
00441     const XMLCh*    srcEnd = srcPtr + srcCount;
00442     XMLByte*        outPtr = toFill;
00443     XMLByte*        outEnd = toFill + maxBytes;
00444 
00445     while (srcPtr < srcEnd)
00446     {
00447         //
00448         //  Tentatively get the next char out. We have to get it into a
00449         //  32 bit value, because it could be a surrogate pair.
00450         //
00451         XMLUInt32 curVal = *srcPtr;
00452 
00453         //
00454         //  If its a leading surrogate, then lets see if we have the trailing
00455         //  available. If not, then give up now and leave it for next time.
00456         //
00457         unsigned int srcUsed = 1;
00458         if ((curVal >= 0xD800) && (curVal <= 0xDBFF))
00459         {
00460             if (srcPtr + 1 >= srcEnd)
00461                 break;
00462 
00463             // Create the composite surrogate pair
00464             curVal = ((curVal - 0xD800) << 10)
00465                     + ((*(srcPtr + 1) - 0xDC00) + 0x10000);
00466 
00467             // And indicate that we ate another one
00468             srcUsed++;
00469         }
00470 
00471         // Figure out how many bytes we need
00472         unsigned int encodedBytes;
00473         if (curVal < 0x80)
00474             encodedBytes = 1;
00475         else if (curVal < 0x800)
00476             encodedBytes = 2;
00477         else if (curVal < 0x10000)
00478             encodedBytes = 3;
00479         else if (curVal < 0x110000)
00480             encodedBytes = 4;
00481         else
00482         {
00483             // If the options say to throw, then throw
00484             if (options == UnRep_Throw)
00485             {
00486                 XMLCh tmpBuf[17];
00487                 XMLString::binToText(curVal, tmpBuf, 16, 16, getMemoryManager());
00488                 ThrowXMLwithMemMgr2
00489                 (
00490                     TranscodingException
00491                     , XMLExcepts::Trans_Unrepresentable
00492                     , tmpBuf
00493                     , getEncodingName()
00494                     , getMemoryManager()
00495                 );
00496             }
00497 
00498             // Else, use the replacement character
00499             *outPtr++ = chSpace;
00500             srcPtr += srcUsed;
00501             continue;
00502         }
00503 
00504         //
00505         //  If we cannot fully get this char into the output buffer,
00506         //  then leave it for the next time.
00507         //
00508         if (outPtr + encodedBytes > outEnd)
00509             break;
00510 
00511         // We can do it, so update the source index
00512         srcPtr += srcUsed;
00513 
00514         //
00515         //  And spit out the bytes. We spit them out in reverse order
00516         //  here, so bump up the output pointer and work down as we go.
00517         //
00518         outPtr += encodedBytes;
00519         switch(encodedBytes)
00520         {
00521             case 6 : *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
00522                      curVal >>= 6;
00523             case 5 : *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
00524                      curVal >>= 6;
00525             case 4 : *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
00526                      curVal >>= 6;
00527             case 3 : *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
00528                      curVal >>= 6;
00529             case 2 : *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
00530                      curVal >>= 6;
00531             case 1 : *--outPtr = XMLByte
00532                      (
00533                         curVal | gFirstByteMark[encodedBytes]
00534                      );
00535         }
00536 
00537         // Add the encoded bytes back in again to indicate we've eaten them
00538         outPtr += encodedBytes;
00539     }
00540 
00541     // Fill in the chars we ate
00542     charsEaten = (srcPtr - srcData);
00543 
00544     // And return the bytes we filled in
00545     return (outPtr - toFill);
00546 }
00547 
00548 
00549 bool XMLUTF8Transcoder::canTranscodeTo(const unsigned int toCheck)
00550 {
00551     // We can represent anything in the Unicode (with surrogates) range
00552     return (toCheck <= 0x10FFFF);
00553 }
00554 
00555 XERCES_CPP_NAMESPACE_END
00556