GME
13
|
00001 /* 00002 * Licensed to the Apache Software Foundation (ASF) under one or more 00003 * contributor license agreements. See the NOTICE file distributed with 00004 * this work for additional information regarding copyright ownership. 00005 * The ASF licenses this file to You under the Apache License, Version 2.0 00006 * (the "License"); you may not use this file except in compliance with 00007 * the License. You may obtain a copy of the License at 00008 * 00009 * http://www.apache.org/licenses/LICENSE-2.0 00010 * 00011 * Unless required by applicable law or agreed to in writing, software 00012 * distributed under the License is distributed on an "AS IS" BASIS, 00013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00014 * See the License for the specific language governing permissions and 00015 * limitations under the License. 00016 */ 00017 00023 // --------------------------------------------------------------------------- 00024 // Includes 00025 // --------------------------------------------------------------------------- 00026 #include <xercesc/framework/XMLRecognizer.hpp> 00027 #include <xercesc/util/RuntimeException.hpp> 00028 #include <xercesc/util/XMLString.hpp> 00029 00030 XERCES_CPP_NAMESPACE_BEGIN 00031 00032 // --------------------------------------------------------------------------- 00033 // Local data 00034 // 00035 // gEncodingNameMap 00036 // This array maps the Encodings enum values to their canonical names. 00037 // Be sure to keep this in sync with that enum! 00038 // --------------------------------------------------------------------------- 00039 static const XMLCh* gEncodingNameMap[XMLRecognizer::Encodings_Count] = 00040 { 00041 XMLUni::fgEBCDICEncodingString 00042 , XMLUni::fgUCS4BEncodingString 00043 , XMLUni::fgUCS4LEncodingString 00044 , XMLUni::fgUSASCIIEncodingString 00045 , XMLUni::fgUTF8EncodingString 00046 , XMLUni::fgUTF16BEncodingString 00047 , XMLUni::fgUTF16LEncodingString 00048 , XMLUni::fgXMLChEncodingString 00049 }; 00050 00051 00052 00053 // --------------------------------------------------------------------------- 00054 // XMLRecognizer: Public, const static data 00055 // 00056 // gXXXPre 00057 // gXXXPreLen 00058 // The byte sequence prefixes for all of the encodings that we can 00059 // auto sense. Also included is the length of each sequence. 00060 // --------------------------------------------------------------------------- 00061 const char XMLRecognizer::fgASCIIPre[] = { 0x3C, 0x3F, 0x78, 0x6D, 0x6C, 0x20 }; 00062 const XMLSize_t XMLRecognizer::fgASCIIPreLen = 6; 00063 const XMLByte XMLRecognizer::fgEBCDICPre[] = { 0x4C, 0x6F, 0xA7, 0x94, 0x93, 0x40 }; 00064 const XMLSize_t XMLRecognizer::fgEBCDICPreLen = 6; 00065 const XMLByte XMLRecognizer::fgUTF16BPre[] = { 0x00, 0x3C, 0x00, 0x3F, 0x00, 0x78, 0x00, 0x6D, 0x00, 0x6C, 0x00, 0x20 }; 00066 const XMLByte XMLRecognizer::fgUTF16LPre[] = { 0x3C, 0x00, 0x3F, 0x00, 0x78, 0x00, 0x6D, 0x00, 0x6C, 0x00, 0x20, 0x00 }; 00067 const XMLSize_t XMLRecognizer::fgUTF16PreLen = 12; 00068 const XMLByte XMLRecognizer::fgUCS4BPre[] = 00069 { 00070 0x00, 0x00, 0x00, 0x3C, 0x00, 0x00, 0x00, 0x3F 00071 , 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D 00072 , 0x00, 0x00, 0x00, 0x6C, 0x00, 0x00, 0x00, 0x20 00073 }; 00074 const XMLByte XMLRecognizer::fgUCS4LPre[] = 00075 { 00076 0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00 00077 , 0x78, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00 00078 , 0x6C, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00 00079 }; 00080 const XMLSize_t XMLRecognizer::fgUCS4PreLen = 24; 00081 00082 const char XMLRecognizer::fgUTF8BOM[] = {(char)0xEF, (char)0xBB, (char)0xBF}; 00083 const XMLSize_t XMLRecognizer::fgUTF8BOMLen = 3; 00084 00085 // --------------------------------------------------------------------------- 00086 // XMLRecognizer: Encoding recognition methods 00087 // --------------------------------------------------------------------------- 00088 XMLRecognizer::Encodings 00089 XMLRecognizer::basicEncodingProbe( const XMLByte* const rawBuffer 00090 , const XMLSize_t rawByteCount) 00091 { 00092 // 00093 // As an optimization to check the 90% case, check first for the ASCII 00094 // sequence '<?xml', which means its either US-ASCII, UTF-8, or some 00095 // other encoding that we don't do manually but which happens to share 00096 // the US-ASCII code points for these characters. So just return UTF-8 00097 // to get us through the first line. 00098 // 00099 if (rawByteCount >= fgASCIIPreLen) 00100 { 00101 if (!memcmp(rawBuffer, fgASCIIPre, fgASCIIPreLen)) 00102 return UTF_8; 00103 } 00104 00105 // 00106 // If the count of raw bytes is less than 2, it cannot be anything 00107 // we understand, so return UTF-8 as a fallback. 00108 // 00109 if (rawByteCount < 2) 00110 return UTF_8; 00111 00112 // 00113 // We have two to four bytes, so lets check for a UTF-16 BOM. That 00114 // is quick to check and enough to identify two major encodings. 00115 // 00116 00117 if (rawByteCount < 4) 00118 { 00119 if ((rawBuffer[0] == 0xFE) && (rawBuffer[1] == 0xFF)) 00120 return UTF_16B; 00121 else if ((rawBuffer[0] == 0xFF) && (rawBuffer[1] == 0xFE)) 00122 return UTF_16L; 00123 else 00124 return UTF_8; 00125 } 00126 00127 /*** 00128 * F.1 Detection Without External Encoding Information 00129 * 00130 * Because each XML entity not accompanied by external encoding information and 00131 * not in UTF-8 or UTF-16 encoding must begin with an XML encoding declaration, 00132 * in which the first characters must be '<?xml', any conforming processor can detect, 00133 * after two to four octets of input, which of the following cases apply. 00134 * 00135 * In reading this list, it may help to know that in UCS-4, '<' is "#x0000003C" and 00136 * '?' is "#x0000003F", and the Byte Order Mark required of UTF-16 data streams is 00137 * "#xFEFF". The notation ## is used to denote any byte value except that two consecutive 00138 * ##s cannot be both 00. 00139 * 00140 * With a Byte Order Mark: 00141 * 00142 * 00 00 FE FF UCS-4, big-endian machine (1234 order) 00143 * FF FE 00 00 UCS-4, little-endian machine (4321 order) 00144 * 00 00 FF FE UCS-4, unusual octet order (2143) 00145 * FE FF 00 00 UCS-4, unusual octet order (3412) 00146 * FE FF ## ## UTF-16, big-endian 00147 * FF FE ## ## UTF-16, little-endian 00148 * EF BB BF UTF-8 00149 * 00150 ***/ 00151 00152 // 00153 // We have at least four bytes, so we can check all BOM 00154 // for UCS-4BE, UCS-4LE, UTF-16BE and UTF-16LE as well. 00155 // 00156 if ((rawBuffer[0] == 0x00) && (rawBuffer[1] == 0x00) && (rawBuffer[2] == 0xFE) && (rawBuffer[3] == 0xFF)) 00157 return UCS_4B; 00158 else if ((rawBuffer[0] == 0xFF) && (rawBuffer[1] == 0xFE) && (rawBuffer[2] == 0x00) && (rawBuffer[3] == 0x00)) 00159 return UCS_4L; 00160 else if ((rawBuffer[0] == 0xFE) && (rawBuffer[1] == 0xFF)) 00161 return UTF_16B; 00162 else if ((rawBuffer[0] == 0xFF) && (rawBuffer[1] == 0xFE)) 00163 return UTF_16L; 00164 00165 // 00166 // We have at least 4 bytes. So lets check the 4 byte sequences that 00167 // indicate other UTF-16 and UCS encodings. 00168 // 00169 if ((rawBuffer[0] == 0x00) || (rawBuffer[0] == 0x3C)) 00170 { 00171 if (rawByteCount >= fgUCS4PreLen && !memcmp(rawBuffer, fgUCS4BPre, fgUCS4PreLen)) 00172 return UCS_4B; 00173 else if (rawByteCount >= fgUCS4PreLen && !memcmp(rawBuffer, fgUCS4LPre, fgUCS4PreLen)) 00174 return UCS_4L; 00175 else if (rawByteCount >= fgUTF16PreLen && !memcmp(rawBuffer, fgUTF16BPre, fgUTF16PreLen)) 00176 return UTF_16B; 00177 else if (rawByteCount >= fgUTF16PreLen && !memcmp(rawBuffer, fgUTF16LPre, fgUTF16PreLen)) 00178 return UTF_16L; 00179 } 00180 00181 // 00182 // See if we have enough bytes to possibly match the EBCDIC prefix. 00183 // If so, try it. 00184 // 00185 if (rawByteCount > fgEBCDICPreLen) 00186 { 00187 if (!memcmp(rawBuffer, fgEBCDICPre, fgEBCDICPreLen)) 00188 return EBCDIC; 00189 } 00190 00191 // 00192 // Does not seem to be anything we know, so go with UTF-8 to get at 00193 // least through the first line and see what it really is. 00194 // 00195 return UTF_8; 00196 } 00197 00198 00199 XMLRecognizer::Encodings 00200 XMLRecognizer::encodingForName(const XMLCh* const encName) 00201 { 00202 // 00203 // Compare the passed string, assume input string is already uppercased, 00204 // to the variations that we recognize. 00205 // 00206 // !!NOTE: Note that we don't handle EBCDIC here because we don't handle 00207 // that one ourselves. It is allowed to fall into 'other'. 00208 // 00209 if (encName == XMLUni::fgXMLChEncodingString || 00210 !XMLString::compareString(encName, XMLUni::fgXMLChEncodingString)) 00211 { 00212 return XMLRecognizer::XERCES_XMLCH; 00213 } 00214 else if (!XMLString::compareString(encName, XMLUni::fgUTF8EncodingString) 00215 || !XMLString::compareString(encName, XMLUni::fgUTF8EncodingString2)) 00216 { 00217 return XMLRecognizer::UTF_8; 00218 } 00219 else if (!XMLString::compareString(encName, XMLUni::fgUSASCIIEncodingString) 00220 || !XMLString::compareString(encName, XMLUni::fgUSASCIIEncodingString2) 00221 || !XMLString::compareString(encName, XMLUni::fgUSASCIIEncodingString3) 00222 || !XMLString::compareString(encName, XMLUni::fgUSASCIIEncodingString4)) 00223 { 00224 return XMLRecognizer::US_ASCII; 00225 } 00226 else if (!XMLString::compareString(encName, XMLUni::fgUTF16LEncodingString) 00227 || !XMLString::compareString(encName, XMLUni::fgUTF16LEncodingString2)) 00228 { 00229 return XMLRecognizer::UTF_16L; 00230 } 00231 else if (!XMLString::compareString(encName, XMLUni::fgUTF16BEncodingString) 00232 || !XMLString::compareString(encName, XMLUni::fgUTF16BEncodingString2)) 00233 { 00234 return XMLRecognizer::UTF_16B; 00235 } 00236 else if (!XMLString::compareString(encName, XMLUni::fgUTF16EncodingString)) 00237 { 00238 return XMLPlatformUtils::fgXMLChBigEndian?XMLRecognizer::UTF_16B:XMLRecognizer::UTF_16L; 00239 } 00240 else if (!XMLString::compareString(encName, XMLUni::fgUCS4LEncodingString) 00241 || !XMLString::compareString(encName, XMLUni::fgUCS4LEncodingString2)) 00242 { 00243 return XMLRecognizer::UCS_4L; 00244 } 00245 else if (!XMLString::compareString(encName, XMLUni::fgUCS4BEncodingString) 00246 || !XMLString::compareString(encName, XMLUni::fgUCS4BEncodingString2)) 00247 { 00248 return XMLRecognizer::UCS_4B; 00249 } 00250 else if (!XMLString::compareString(encName, XMLUni::fgUCS4EncodingString)) 00251 { 00252 return XMLPlatformUtils::fgXMLChBigEndian?XMLRecognizer::UCS_4B:XMLRecognizer::UCS_4L; 00253 } 00254 00255 // Return 'other' since we don't recognizer it 00256 return XMLRecognizer::OtherEncoding; 00257 } 00258 00259 00260 const XMLCh* 00261 XMLRecognizer::nameForEncoding(const XMLRecognizer::Encodings theEncoding 00262 , MemoryManager* const manager) 00263 { 00264 if (theEncoding >= Encodings_Count) 00265 ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::XMLRec_UnknownEncoding, manager); 00266 00267 return gEncodingNameMap[theEncoding]; 00268 } 00269 00270 XERCES_CPP_NAMESPACE_END