GME
13
|
00001 /* 00002 * Licensed to the Apache Software Foundation (ASF) under one or more 00003 * contributor license agreements. See the NOTICE file distributed with 00004 * this work for additional information regarding copyright ownership. 00005 * The ASF licenses this file to You under the Apache License, Version 2.0 00006 * (the "License"); you may not use this file except in compliance with 00007 * the License. You may obtain a copy of the License at 00008 * 00009 * http://www.apache.org/licenses/LICENSE-2.0 00010 * 00011 * Unless required by applicable law or agreed to in writing, software 00012 * distributed under the License is distributed on an "AS IS" BASIS, 00013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00014 * See the License for the specific language governing permissions and 00015 * limitations under the License. 00016 */ 00017 00022 // --------------------------------------------------------------------------- 00023 // Includes 00024 // --------------------------------------------------------------------------- 00025 #include <xercesc/util/TranscodingException.hpp> 00026 #include <xercesc/util/XMLString.hpp> 00027 #include <xercesc/util/XMLUniDefs.hpp> 00028 #include <xercesc/util/XMLUTF8Transcoder.hpp> 00029 00030 XERCES_CPP_NAMESPACE_BEGIN 00031 00032 // --------------------------------------------------------------------------- 00033 // Local static data 00034 // 00035 // gUTFBytes 00036 // A list of counts of trailing bytes for each initial byte in the input. 00037 // 00038 // gUTFByteIndicator 00039 // For a UTF8 sequence of n bytes, n>=2, the first byte of the 00040 // sequence must contain n 1's followed by precisely 1 0 with the 00041 // rest of the byte containing arbitrary bits. This array stores 00042 // the required bit pattern for validity checking. 00043 // gUTFByteIndicatorTest 00044 // When bitwise and'd with the observed value, if the observed 00045 // value is correct then a result matching gUTFByteIndicator will 00046 // be produced. 00047 // 00048 // gUTFOffsets 00049 // A list of values to offset each result char type, according to how 00050 // many source bytes when into making it. 00051 // 00052 // gFirstByteMark 00053 // A list of values to mask onto the first byte of an encoded sequence, 00054 // indexed by the number of bytes used to create the sequence. 00055 // --------------------------------------------------------------------------- 00056 static const XMLByte gUTFBytes[256] = 00057 { 00058 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 00059 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 00060 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 00061 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 00062 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 00063 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 00064 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 00065 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 00066 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 00067 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 00068 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 00069 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 00070 , 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 00071 , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 00072 , 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 00073 , 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 00074 }; 00075 00076 static const XMLByte gUTFByteIndicator[6] = 00077 { 00078 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC 00079 }; 00080 static const XMLByte gUTFByteIndicatorTest[6] = 00081 { 00082 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE 00083 }; 00084 00085 static const XMLUInt32 gUTFOffsets[6] = 00086 { 00087 0, 0x3080, 0xE2080, 0x3C82080, 0xFA082080, 0x82082080 00088 }; 00089 00090 static const XMLByte gFirstByteMark[7] = 00091 { 00092 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC 00093 }; 00094 00095 00096 00097 // --------------------------------------------------------------------------- 00098 // XMLUTF8Transcoder: Constructors and Destructor 00099 // --------------------------------------------------------------------------- 00100 XMLUTF8Transcoder::XMLUTF8Transcoder(const XMLCh* const encodingName 00101 , const XMLSize_t blockSize 00102 , MemoryManager* const manager) 00103 :XMLTranscoder(encodingName, blockSize, manager) 00104 { 00105 } 00106 00107 XMLUTF8Transcoder::~XMLUTF8Transcoder() 00108 { 00109 } 00110 00111 00112 // --------------------------------------------------------------------------- 00113 // XMLUTF8Transcoder: Implementation of the transcoder API 00114 // --------------------------------------------------------------------------- 00115 XMLSize_t 00116 XMLUTF8Transcoder::transcodeFrom(const XMLByte* const srcData 00117 , const XMLSize_t srcCount 00118 , XMLCh* const toFill 00119 , const XMLSize_t maxChars 00120 , XMLSize_t& bytesEaten 00121 , unsigned char* const charSizes) 00122 { 00123 // Watch for pathological scenario. Shouldn't happen, but... 00124 if (!srcCount || !maxChars) 00125 return 0; 00126 00127 // 00128 // Get pointers to our start and end points of the input and output 00129 // buffers. 00130 // 00131 const XMLByte* srcPtr = srcData; 00132 const XMLByte* srcEnd = srcPtr + srcCount; 00133 XMLCh* outPtr = toFill; 00134 XMLCh* outEnd = outPtr + maxChars; 00135 unsigned char* sizePtr = charSizes; 00136 00137 00138 00139 // 00140 // We now loop until we either run out of input data, or room to store 00141 // output chars. 00142 // 00143 while ((srcPtr < srcEnd) && (outPtr < outEnd)) 00144 { 00145 // Special-case ASCII, which is a leading byte value of <= 127 00146 if (*srcPtr <= 127) 00147 { 00148 // Handle ASCII in groups instead of single character at a time. 00149 const XMLByte* srcPtr_save = srcPtr; 00150 const XMLSize_t chunkSize = (srcEnd-srcPtr)<(outEnd-outPtr)?(srcEnd-srcPtr):(outEnd-outPtr); 00151 for(XMLSize_t i=0;i<chunkSize && *srcPtr <= 127;++i) 00152 *outPtr++ = XMLCh(*srcPtr++); 00153 memset(sizePtr,1,srcPtr - srcPtr_save); 00154 sizePtr += srcPtr - srcPtr_save; 00155 if (srcPtr == srcEnd || outPtr == outEnd) 00156 break; 00157 } 00158 00159 // See how many trailing src bytes this sequence is going to require 00160 const unsigned int trailingBytes = gUTFBytes[*srcPtr]; 00161 00162 // 00163 // If there are not enough source bytes to do this one, then we 00164 // are done. Note that we done >= here because we are implicitly 00165 // counting the 1 byte we get no matter what. 00166 // 00167 // If we break out here, then there is nothing to undo since we 00168 // haven't updated any pointers yet. 00169 // 00170 if (srcPtr + trailingBytes >= srcEnd) 00171 break; 00172 00173 // Looks ok, so lets build up the value 00174 // or at least let's try to do so--remembering that 00175 // we cannot assume the encoding to be valid: 00176 00177 // first, test first byte 00178 if((gUTFByteIndicatorTest[trailingBytes] & *srcPtr) != gUTFByteIndicator[trailingBytes]) { 00179 char pos[2] = {(char)0x31, 0}; 00180 char len[2] = {(char)(trailingBytes+0x31), 0}; 00181 char byte[2] = {*srcPtr,0}; 00182 ThrowXMLwithMemMgr3(UTFDataFormatException, XMLExcepts::UTF8_FormatError, pos, byte, len, getMemoryManager()); 00183 } 00184 00185 /*** 00186 * http://www.unicode.org/reports/tr27/ 00187 * 00188 * Table 3.1B. lists all of the byte sequences that are legal in UTF-8. 00189 * A range of byte values such as A0..BF indicates that any byte from A0 to BF (inclusive) 00190 * is legal in that position. 00191 * Any byte value outside of the ranges listed is illegal. 00192 * For example, 00193 * the byte sequence <C0 AF> is illegal since C0 is not legal in the 1st Byte column. 00194 * The byte sequence <E0 9F 80> is illegal since in the row 00195 * where E0 is legal as a first byte, 00196 * 9F is not legal as a second byte. 00197 * The byte sequence <F4 80 83 92> is legal, since every byte in that sequence matches 00198 * a byte range in a row of the table (the last row). 00199 * 00200 * 00201 * Table 3.1B. Legal UTF-8 Byte Sequences 00202 * Code Points 1st Byte 2nd Byte 3rd Byte 4th Byte 00203 * ========================================================================= 00204 * U+0000..U+007F 00..7F 00205 * ------------------------------------------------------------------------- 00206 * U+0080..U+07FF C2..DF 80..BF 00207 * 00208 * ------------------------------------------------------------------------- 00209 * U+0800..U+0FFF E0 A0..BF 80..BF 00210 * -- 00211 * 00212 * U+1000..U+FFFF E1..EF 80..BF 80..BF 00213 * 00214 * -------------------------------------------------------------------------- 00215 * U+10000..U+3FFFF F0 90..BF 80..BF 80..BF 00216 * -- 00217 * U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF 00218 * U+100000..U+10FFFF F4 80..8F 80..BF 80..BF 00219 * -- 00220 * ========================================================================== 00221 * 00222 * Cases where a trailing byte range is not 80..BF are underlined in the table to 00223 * draw attention to them. These occur only in the second byte of a sequence. 00224 * 00225 ***/ 00226 00227 XMLUInt32 tmpVal = 0; 00228 00229 switch(trailingBytes) 00230 { 00231 case 1 : 00232 // UTF-8: [110y yyyy] [10xx xxxx] 00233 // Unicode: [0000 0yyy] [yyxx xxxx] 00234 // 00235 // 0xC0, 0xC1 has been filtered out 00236 checkTrailingBytes(*(srcPtr+1), 1, 1); 00237 00238 tmpVal = *srcPtr++; 00239 tmpVal <<= 6; 00240 tmpVal += *srcPtr++; 00241 00242 break; 00243 case 2 : 00244 // UTF-8: [1110 zzzz] [10yy yyyy] [10xx xxxx] 00245 // Unicode: [zzzz yyyy] [yyxx xxxx] 00246 // 00247 if (( *srcPtr == 0xE0) && ( *(srcPtr+1) < 0xA0)) 00248 { 00249 char byte0[2] = {*srcPtr ,0}; 00250 char byte1[2] = {*(srcPtr+1),0}; 00251 00252 ThrowXMLwithMemMgr2(UTFDataFormatException 00253 , XMLExcepts::UTF8_Invalid_3BytesSeq 00254 , byte0 00255 , byte1 00256 , getMemoryManager()); 00257 } 00258 00259 checkTrailingBytes(*(srcPtr+1), 2, 1); 00260 checkTrailingBytes(*(srcPtr+2), 2, 2); 00261 00262 // 00263 // D36 (a) UTF-8 is the Unicode Transformation Format that serializes 00264 // a Unicode code point as a sequence of one to four bytes, 00265 // as specified in Table 3.1, UTF-8 Bit Distribution. 00266 // (b) An illegal UTF-8 code unit sequence is any byte sequence that 00267 // does not match the patterns listed in Table 3.1B, Legal UTF-8 00268 // Byte Sequences. 00269 // (c) An irregular UTF-8 code unit sequence is a six-byte sequence 00270 // where the first three bytes correspond to a high surrogate, 00271 // and the next three bytes correspond to a low surrogate. 00272 // As a consequence of C12, these irregular UTF-8 sequences shall 00273 // not be generated by a conformant process. 00274 // 00275 //irregular three bytes sequence 00276 // that is zzzzyy matches leading surrogate tag 110110 or 00277 // trailing surrogate tag 110111 00278 // *srcPtr=1110 1101 00279 // *(srcPtr+1)=1010 yyyy or 00280 // *(srcPtr+1)=1011 yyyy 00281 // 00282 // 0xED 1110 1101 00283 // 0xA0 1010 0000 00284 00285 if ((*srcPtr == 0xED) && (*(srcPtr+1) >= 0xA0)) 00286 { 00287 char byte0[2] = {*srcPtr, 0}; 00288 char byte1[2] = {*(srcPtr+1),0}; 00289 00290 ThrowXMLwithMemMgr2(UTFDataFormatException 00291 , XMLExcepts::UTF8_Irregular_3BytesSeq 00292 , byte0 00293 , byte1 00294 , getMemoryManager()); 00295 } 00296 00297 tmpVal = *srcPtr++; 00298 tmpVal <<= 6; 00299 tmpVal += *srcPtr++; 00300 tmpVal <<= 6; 00301 tmpVal += *srcPtr++; 00302 00303 break; 00304 case 3 : 00305 // UTF-8: [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]* 00306 // Unicode: [1101 10ww] [wwzz zzyy] (high surrogate) 00307 // [1101 11yy] [yyxx xxxx] (low surrogate) 00308 // * uuuuu = wwww + 1 00309 // 00310 if (((*srcPtr == 0xF0) && (*(srcPtr+1) < 0x90)) || 00311 ((*srcPtr == 0xF4) && (*(srcPtr+1) > 0x8F)) ) 00312 { 00313 char byte0[2] = {*srcPtr ,0}; 00314 char byte1[2] = {*(srcPtr+1),0}; 00315 00316 ThrowXMLwithMemMgr2(UTFDataFormatException 00317 , XMLExcepts::UTF8_Invalid_4BytesSeq 00318 , byte0 00319 , byte1 00320 , getMemoryManager()); 00321 } 00322 00323 checkTrailingBytes(*(srcPtr+1), 3, 1); 00324 checkTrailingBytes(*(srcPtr+2), 3, 2); 00325 checkTrailingBytes(*(srcPtr+3), 3, 3); 00326 00327 tmpVal = *srcPtr++; 00328 tmpVal <<= 6; 00329 tmpVal += *srcPtr++; 00330 tmpVal <<= 6; 00331 tmpVal += *srcPtr++; 00332 tmpVal <<= 6; 00333 tmpVal += *srcPtr++; 00334 00335 break; 00336 default: // trailingBytes > 3 00337 00338 /*** 00339 * The definition of UTF-8 in Annex D of ISO/IEC 10646-1:2000 also allows 00340 * for the use of five- and six-byte sequences to encode characters that 00341 * are outside the range of the Unicode character set; those five- and 00342 * six-byte sequences are illegal for the use of UTF-8 as a transformation 00343 * of Unicode characters. ISO/IEC 10646 does not allow mapping of unpaired 00344 * surrogates, nor U+FFFE and U+FFFF (but it does allow other noncharacters). 00345 ***/ 00346 char len[2] = {(char)(trailingBytes+0x31), 0}; 00347 char byte[2] = {*srcPtr,0}; 00348 00349 ThrowXMLwithMemMgr2(UTFDataFormatException 00350 , XMLExcepts::UTF8_Exceeds_BytesLimit 00351 , byte 00352 , len 00353 , getMemoryManager()); 00354 00355 break; 00356 } 00357 00358 00359 // since trailingBytes comes from an array, this logic is redundant 00360 // default : 00361 // ThrowXMLwithMemMgr(TranscodingException, XMLExcepts::Trans_BadSrcSeq); 00362 //} 00363 tmpVal -= gUTFOffsets[trailingBytes]; 00364 00365 // 00366 // If it will fit into a single char, then put it in. Otherwise 00367 // encode it as a surrogate pair. If its not valid, use the 00368 // replacement char. 00369 // 00370 if (!(tmpVal & 0xFFFF0000)) 00371 { 00372 *sizePtr++ = trailingBytes + 1; 00373 *outPtr++ = XMLCh(tmpVal); 00374 } 00375 else if (tmpVal > 0x10FFFF) 00376 { 00377 // 00378 // If we've gotten more than 32 chars so far, then just break 00379 // out for now and lets process those. When we come back in 00380 // here again, we'll get no chars and throw an exception. This 00381 // way, the error will have a line and col number closer to 00382 // the real problem area. 00383 // 00384 if ((outPtr - toFill) > 32) 00385 break; 00386 00387 ThrowXMLwithMemMgr(TranscodingException, XMLExcepts::Trans_BadSrcSeq, getMemoryManager()); 00388 } 00389 else 00390 { 00391 // 00392 // If we have enough room to store the leading and trailing 00393 // chars, then lets do it. Else, pretend this one never 00394 // happened, and leave it for the next time. Since we don't 00395 // update the bytes read until the bottom of the loop, by 00396 // breaking out here its like it never happened. 00397 // 00398 if (outPtr + 1 >= outEnd) 00399 break; 00400 00401 // Store the leading surrogate char 00402 tmpVal -= 0x10000; 00403 *sizePtr++ = trailingBytes + 1; 00404 *outPtr++ = XMLCh((tmpVal >> 10) + 0xD800); 00405 00406 // 00407 // And then the trailing char. This one accounts for no 00408 // bytes eaten from the source, so set the char size for this 00409 // one to be zero. 00410 // 00411 *sizePtr++ = 0; 00412 *outPtr++ = XMLCh((tmpVal & 0x3FF) + 0xDC00); 00413 } 00414 } 00415 00416 // Update the bytes eaten 00417 bytesEaten = srcPtr - srcData; 00418 00419 // Return the characters read 00420 return outPtr - toFill; 00421 } 00422 00423 00424 XMLSize_t 00425 XMLUTF8Transcoder::transcodeTo( const XMLCh* const srcData 00426 , const XMLSize_t srcCount 00427 , XMLByte* const toFill 00428 , const XMLSize_t maxBytes 00429 , XMLSize_t& charsEaten 00430 , const UnRepOpts options) 00431 { 00432 // Watch for pathological scenario. Shouldn't happen, but... 00433 if (!srcCount || !maxBytes) 00434 return 0; 00435 00436 // 00437 // Get pointers to our start and end points of the input and output 00438 // buffers. 00439 // 00440 const XMLCh* srcPtr = srcData; 00441 const XMLCh* srcEnd = srcPtr + srcCount; 00442 XMLByte* outPtr = toFill; 00443 XMLByte* outEnd = toFill + maxBytes; 00444 00445 while (srcPtr < srcEnd) 00446 { 00447 // 00448 // Tentatively get the next char out. We have to get it into a 00449 // 32 bit value, because it could be a surrogate pair. 00450 // 00451 XMLUInt32 curVal = *srcPtr; 00452 00453 // 00454 // If its a leading surrogate, then lets see if we have the trailing 00455 // available. If not, then give up now and leave it for next time. 00456 // 00457 unsigned int srcUsed = 1; 00458 if ((curVal >= 0xD800) && (curVal <= 0xDBFF)) 00459 { 00460 if (srcPtr + 1 >= srcEnd) 00461 break; 00462 00463 // Create the composite surrogate pair 00464 curVal = ((curVal - 0xD800) << 10) 00465 + ((*(srcPtr + 1) - 0xDC00) + 0x10000); 00466 00467 // And indicate that we ate another one 00468 srcUsed++; 00469 } 00470 00471 // Figure out how many bytes we need 00472 unsigned int encodedBytes; 00473 if (curVal < 0x80) 00474 encodedBytes = 1; 00475 else if (curVal < 0x800) 00476 encodedBytes = 2; 00477 else if (curVal < 0x10000) 00478 encodedBytes = 3; 00479 else if (curVal < 0x110000) 00480 encodedBytes = 4; 00481 else 00482 { 00483 // If the options say to throw, then throw 00484 if (options == UnRep_Throw) 00485 { 00486 XMLCh tmpBuf[17]; 00487 XMLString::binToText(curVal, tmpBuf, 16, 16, getMemoryManager()); 00488 ThrowXMLwithMemMgr2 00489 ( 00490 TranscodingException 00491 , XMLExcepts::Trans_Unrepresentable 00492 , tmpBuf 00493 , getEncodingName() 00494 , getMemoryManager() 00495 ); 00496 } 00497 00498 // Else, use the replacement character 00499 *outPtr++ = chSpace; 00500 srcPtr += srcUsed; 00501 continue; 00502 } 00503 00504 // 00505 // If we cannot fully get this char into the output buffer, 00506 // then leave it for the next time. 00507 // 00508 if (outPtr + encodedBytes > outEnd) 00509 break; 00510 00511 // We can do it, so update the source index 00512 srcPtr += srcUsed; 00513 00514 // 00515 // And spit out the bytes. We spit them out in reverse order 00516 // here, so bump up the output pointer and work down as we go. 00517 // 00518 outPtr += encodedBytes; 00519 switch(encodedBytes) 00520 { 00521 case 6 : *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL); 00522 curVal >>= 6; 00523 case 5 : *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL); 00524 curVal >>= 6; 00525 case 4 : *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL); 00526 curVal >>= 6; 00527 case 3 : *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL); 00528 curVal >>= 6; 00529 case 2 : *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL); 00530 curVal >>= 6; 00531 case 1 : *--outPtr = XMLByte 00532 ( 00533 curVal | gFirstByteMark[encodedBytes] 00534 ); 00535 } 00536 00537 // Add the encoded bytes back in again to indicate we've eaten them 00538 outPtr += encodedBytes; 00539 } 00540 00541 // Fill in the chars we ate 00542 charsEaten = (srcPtr - srcData); 00543 00544 // And return the bytes we filled in 00545 return (outPtr - toFill); 00546 } 00547 00548 00549 bool XMLUTF8Transcoder::canTranscodeTo(const unsigned int toCheck) 00550 { 00551 // We can represent anything in the Unicode (with surrogates) range 00552 return (toCheck <= 0x10FFFF); 00553 } 00554 00555 XERCES_CPP_NAMESPACE_END 00556