GME  13
XMLRecognizer.cpp
Go to the documentation of this file.
00001 /*
00002  * Licensed to the Apache Software Foundation (ASF) under one or more
00003  * contributor license agreements.  See the NOTICE file distributed with
00004  * this work for additional information regarding copyright ownership.
00005  * The ASF licenses this file to You under the Apache License, Version 2.0
00006  * (the "License"); you may not use this file except in compliance with
00007  * the License.  You may obtain a copy of the License at
00008  * 
00009  *      http://www.apache.org/licenses/LICENSE-2.0
00010  * 
00011  * Unless required by applicable law or agreed to in writing, software
00012  * distributed under the License is distributed on an "AS IS" BASIS,
00013  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00014  * See the License for the specific language governing permissions and
00015  * limitations under the License.
00016  */
00017 
00023 // ---------------------------------------------------------------------------
00024 //  Includes
00025 // ---------------------------------------------------------------------------
00026 #include <xercesc/framework/XMLRecognizer.hpp>
00027 #include <xercesc/util/RuntimeException.hpp>
00028 #include <xercesc/util/XMLString.hpp>
00029 
00030 XERCES_CPP_NAMESPACE_BEGIN
00031 
00032 // ---------------------------------------------------------------------------
00033 //  Local data
00034 //
00035 //  gEncodingNameMap
00036 //      This array maps the Encodings enum values to their canonical names.
00037 //      Be sure to keep this in sync with that enum!
00038 // ---------------------------------------------------------------------------
00039 static const XMLCh* gEncodingNameMap[XMLRecognizer::Encodings_Count] =
00040 {
00041     XMLUni::fgEBCDICEncodingString
00042     , XMLUni::fgUCS4BEncodingString
00043     , XMLUni::fgUCS4LEncodingString
00044     , XMLUni::fgUSASCIIEncodingString
00045     , XMLUni::fgUTF8EncodingString
00046     , XMLUni::fgUTF16BEncodingString
00047     , XMLUni::fgUTF16LEncodingString
00048     , XMLUni::fgXMLChEncodingString
00049 };
00050 
00051 
00052 
00053 // ---------------------------------------------------------------------------
00054 //  XMLRecognizer: Public, const static data
00055 //
00056 //  fgXXXPre
00057 //  fgXXXPreLen
00058 //      The byte sequence prefixes for all of the encodings that we can
00059 //      auto sense. Also included is the length of each sequence.
00060 // ---------------------------------------------------------------------------
00061 const char           XMLRecognizer::fgASCIIPre[]  = { 0x3C, 0x3F, 0x78, 0x6D, 0x6C, 0x20 };
00062 const XMLSize_t      XMLRecognizer::fgASCIIPreLen = 6;
00063 const XMLByte        XMLRecognizer::fgEBCDICPre[] = { 0x4C, 0x6F, 0xA7, 0x94, 0x93, 0x40 };
00064 const XMLSize_t      XMLRecognizer::fgEBCDICPreLen = 6;
00065 const XMLByte        XMLRecognizer::fgUTF16BPre[] = { 0x00, 0x3C, 0x00, 0x3F, 0x00, 0x78, 0x00, 0x6D, 0x00, 0x6C, 0x00, 0x20 };
00066 const XMLByte        XMLRecognizer::fgUTF16LPre[] = { 0x3C, 0x00, 0x3F, 0x00, 0x78, 0x00, 0x6D, 0x00, 0x6C, 0x00, 0x20, 0x00 };
00067 const XMLSize_t      XMLRecognizer::fgUTF16PreLen = 12;
00068 const XMLByte        XMLRecognizer::fgUCS4BPre[]  =
00069 {
00070         0x00, 0x00, 0x00, 0x3C, 0x00, 0x00, 0x00, 0x3F
00071     ,   0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D
00072     ,   0x00, 0x00, 0x00, 0x6C, 0x00, 0x00, 0x00, 0x20
00073 };
00074 const XMLByte        XMLRecognizer::fgUCS4LPre[]  =
00075 {
00076         0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00
00077     ,   0x78, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00
00078     ,   0x6C, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00
00079 };
00080 const XMLSize_t      XMLRecognizer::fgUCS4PreLen = 24;
00081 
00082 const char           XMLRecognizer::fgUTF8BOM[] = {(char)0xEF, (char)0xBB, (char)0xBF};
00083 const XMLSize_t      XMLRecognizer::fgUTF8BOMLen = 3;
00084 
00085 // ---------------------------------------------------------------------------
00086 //  XMLRecognizer: Encoding recognition methods
00087 // ---------------------------------------------------------------------------
00088 XMLRecognizer::Encodings
00089 XMLRecognizer::basicEncodingProbe(  const   XMLByte* const  rawBuffer
00090                                     , const XMLSize_t       rawByteCount)
00091 {
00092     //
00093     //  As an optimization to check the 90% case, check first for the ASCII
00094     //  sequence '<?xml', which means its either US-ASCII, UTF-8, or some
00095     //  other encoding that we don't do manually but which happens to share
00096     //  the US-ASCII code points for these characters. So just return UTF-8
00097     //  to get us through the first line.
00098     //
00099     if (rawByteCount >= fgASCIIPreLen)
00100     {
00101         if (!memcmp(rawBuffer, fgASCIIPre, fgASCIIPreLen))
00102             return UTF_8;
00103     }
00104 
00105     //
00106     //  If the count of raw bytes is less than 2, it cannot be anything
00107     //  we understand, so return UTF-8 as a fallback.
00108     //
00109     if (rawByteCount < 2)
00110         return UTF_8;
00111          
00112     //  
00113     //  We have two to four bytes, so lets check for a UTF-16 BOM. That
00114     //  is quick to check and enough to identify two major encodings.   
00115     // 
00116 
00117     if (rawByteCount < 4)
00118     {
00119         if ((rawBuffer[0] == 0xFE) && (rawBuffer[1] == 0xFF))
00120             return UTF_16B;
00121         else if ((rawBuffer[0] == 0xFF) && (rawBuffer[1] == 0xFE))
00122             return UTF_16L;
00123         else 
00124             return UTF_8;
00125     }
00126 
00127     /***
00128      *    F.1 Detection Without External Encoding Information
00129      *
00130      *    Because each XML entity not accompanied by external encoding information and 
00131      *    not in UTF-8 or UTF-16 encoding must begin with an XML encoding declaration, 
00132      *    in which the first characters must be '<?xml', any conforming processor can detect, 
00133      *    after two to four octets of input, which of the following cases apply. 
00134      *
00135      *    In reading this list, it may help to know that in UCS-4, '<' is "#x0000003C" and 
00136      *    '?' is "#x0000003F", and the Byte Order Mark required of UTF-16 data streams is 
00137      *    "#xFEFF". The notation ## is used to denote any byte value except that two consecutive 
00138      *    ##s cannot be both 00.
00139      *
00140      *    With a Byte Order Mark:
00141      *
00142      *    00 00 FE FF           UCS-4,    big-endian machine    (1234 order) 
00143      *    FF FE 00 00           UCS-4,    little-endian machine (4321 order) 
00144      *    00 00 FF FE           UCS-4,    unusual octet order   (2143) 
00145      *    FE FF 00 00           UCS-4,    unusual octet order   (3412) 
00146      *    FE FF ## ##           UTF-16,   big-endian 
00147      *    FF FE ## ##           UTF-16,   little-endian 
00148      *    EF BB BF              UTF-8 
00149      *
00150      ***/
00151 
00152     //
00153     //  We have at least four bytes, so we can check all BOM
00154     //  for UCS-4BE, UCS-4LE, UTF-16BE and UTF-16LE as well.
00155     //
00156     if ((rawBuffer[0] == 0x00) && (rawBuffer[1] == 0x00) && (rawBuffer[2] == 0xFE) && (rawBuffer[3] == 0xFF))
00157         return UCS_4B;
00158     else if ((rawBuffer[0] == 0xFF) && (rawBuffer[1] == 0xFE) && (rawBuffer[2] == 0x00) && (rawBuffer[3] == 0x00))
00159         return UCS_4L;
00160     else if ((rawBuffer[0] == 0xFE) && (rawBuffer[1] == 0xFF))
00161         return UTF_16B;
00162     else if ((rawBuffer[0] == 0xFF) && (rawBuffer[1] == 0xFE))
00163         return UTF_16L;
00164 
00165     //
00166     //  We have at least 4 bytes. So lets check the 4 byte sequences that
00167     //  indicate other UTF-16 and UCS encodings.
00168     //
00169     if ((rawBuffer[0] == 0x00) || (rawBuffer[0] == 0x3C))
00170     {
00171         if (rawByteCount >= fgUCS4PreLen && !memcmp(rawBuffer, fgUCS4BPre, fgUCS4PreLen))
00172             return UCS_4B;
00173         else if (rawByteCount >= fgUCS4PreLen && !memcmp(rawBuffer, fgUCS4LPre, fgUCS4PreLen))
00174             return UCS_4L;
00175         else if (rawByteCount >= fgUTF16PreLen && !memcmp(rawBuffer, fgUTF16BPre, fgUTF16PreLen))
00176             return UTF_16B;
00177         else if (rawByteCount >= fgUTF16PreLen && !memcmp(rawBuffer, fgUTF16LPre, fgUTF16PreLen))
00178             return UTF_16L;
00179     }
00180 
00181     //
00182     //  See if we have enough bytes to possibly match the EBCDIC prefix.
00183     //  If so, try it.
00184     //
00185     if (rawByteCount > fgEBCDICPreLen)
00186     {
00187         if (!memcmp(rawBuffer, fgEBCDICPre, fgEBCDICPreLen))
00188             return EBCDIC;
00189     }
00190 
00191     //
00192     //  Does not seem to be anything we know, so go with UTF-8 to get at
00193     //  least through the first line and see what it really is.
00194     //
00195     return UTF_8;
00196 }
00197 
00198 
00199 XMLRecognizer::Encodings
00200 XMLRecognizer::encodingForName(const XMLCh* const encName)
00201 {
00202     //
00203     //  Compare the passed string, assume input string is already uppercased,
00204     //  to the variations that we recognize.
00205     //
00206     //  !!NOTE: Note that we don't handle EBCDIC here because we don't handle
00207     //  that one ourselves. It is allowed to fall into 'other'.
00208     //
00209     if (encName == XMLUni::fgXMLChEncodingString ||
00210         !XMLString::compareString(encName, XMLUni::fgXMLChEncodingString))
00211     {
00212         return XMLRecognizer::XERCES_XMLCH;
00213     }
00214     else if (!XMLString::compareString(encName, XMLUni::fgUTF8EncodingString)
00215          ||  !XMLString::compareString(encName, XMLUni::fgUTF8EncodingString2))
00216     {
00217         return XMLRecognizer::UTF_8;
00218     }
00219     else if (!XMLString::compareString(encName, XMLUni::fgUSASCIIEncodingString)
00220          ||  !XMLString::compareString(encName, XMLUni::fgUSASCIIEncodingString2)
00221          ||  !XMLString::compareString(encName, XMLUni::fgUSASCIIEncodingString3)
00222          ||  !XMLString::compareString(encName, XMLUni::fgUSASCIIEncodingString4))
00223     {
00224         return XMLRecognizer::US_ASCII;
00225     }
00226     else if (!XMLString::compareString(encName, XMLUni::fgUTF16LEncodingString)
00227          ||  !XMLString::compareString(encName, XMLUni::fgUTF16LEncodingString2))
00228     {
00229         return XMLRecognizer::UTF_16L;
00230     }
00231     else if (!XMLString::compareString(encName, XMLUni::fgUTF16BEncodingString)
00232          ||  !XMLString::compareString(encName, XMLUni::fgUTF16BEncodingString2))
00233     {
00234         return XMLRecognizer::UTF_16B;
00235     }
00236     else if (!XMLString::compareString(encName, XMLUni::fgUTF16EncodingString))
00237     {
00238         return XMLPlatformUtils::fgXMLChBigEndian?XMLRecognizer::UTF_16B:XMLRecognizer::UTF_16L;
00239     }
00240     else if (!XMLString::compareString(encName, XMLUni::fgUCS4LEncodingString)
00241          ||  !XMLString::compareString(encName, XMLUni::fgUCS4LEncodingString2))
00242     {
00243         return XMLRecognizer::UCS_4L;
00244     }
00245     else if (!XMLString::compareString(encName, XMLUni::fgUCS4BEncodingString)
00246          ||  !XMLString::compareString(encName, XMLUni::fgUCS4BEncodingString2))
00247     {
00248         return XMLRecognizer::UCS_4B;
00249     }
00250     else if (!XMLString::compareString(encName, XMLUni::fgUCS4EncodingString))
00251     {
00252         return XMLPlatformUtils::fgXMLChBigEndian?XMLRecognizer::UCS_4B:XMLRecognizer::UCS_4L;
00253     }
00254 
00255     // Return 'other' since we don't recognizer it
00256     return XMLRecognizer::OtherEncoding;
00257 }
00258 
00259 
00260 const XMLCh*
00261 XMLRecognizer::nameForEncoding(const XMLRecognizer::Encodings theEncoding
00262                                , MemoryManager* const manager)
00263 {
00264     if (theEncoding >= Encodings_Count)
00265         ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::XMLRec_UnknownEncoding, manager);
00266 
00267     return gEncodingNameMap[theEncoding];
00268 }
00269 
00270 XERCES_CPP_NAMESPACE_END