GME  13
UnicodeRangeFactory.cpp
Go to the documentation of this file.
00001 /*
00002  * Licensed to the Apache Software Foundation (ASF) under one or more
00003  * contributor license agreements.  See the NOTICE file distributed with
00004  * this work for additional information regarding copyright ownership.
00005  * The ASF licenses this file to You under the Apache License, Version 2.0
00006  * (the "License"); you may not use this file except in compliance with
00007  * the License.  You may obtain a copy of the License at
00008  * 
00009  *      http://www.apache.org/licenses/LICENSE-2.0
00010  * 
00011  * Unless required by applicable law or agreed to in writing, software
00012  * distributed under the License is distributed on an "AS IS" BASIS,
00013  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00014  * See the License for the specific language governing permissions and
00015  * limitations under the License.
00016  */
00017 
00018 /*
00019  * $Id: UnicodeRangeFactory.cpp 678879 2008-07-22 20:05:05Z amassari $
00020  */
00021 
00022 // ---------------------------------------------------------------------------
00023 //  Includes
00024 // ---------------------------------------------------------------------------
00025 #include <xercesc/util/regx/UnicodeRangeFactory.hpp>
00026 #include <xercesc/util/regx/TokenFactory.hpp>
00027 #include <xercesc/util/regx/RangeToken.hpp>
00028 #include <xercesc/util/regx/RangeTokenMap.hpp>
00029 #include <xercesc/util/regx/RegxDefs.hpp>
00030 #include <xercesc/util/regx/XMLUniCharacter.hpp>
00031 
00032 XERCES_CPP_NAMESPACE_BEGIN
00033 
00034 // ---------------------------------------------------------------------------
00035 //  Local data
00036 // ---------------------------------------------------------------------------
00037 
00038 const XMLCh uniCategNames[][3] =
00039 {
00040     {chLatin_C, chLatin_n, chNull},     // UNASSIGNED
00041     {chLatin_L, chLatin_u, chNull},     // UPPERCASE_LETTER
00042     {chLatin_L, chLatin_l, chNull},     // LOWERCASE_LETTER
00043     {chLatin_L, chLatin_t, chNull},     // TITLECASE_LETTER
00044     {chLatin_L, chLatin_m, chNull},     // MODIFIER_LETTER
00045     {chLatin_L, chLatin_o, chNull},     // OTHER_LETTER
00046     {chLatin_M, chLatin_n, chNull},     // NON_SPACING_MARK
00047     {chLatin_M, chLatin_e, chNull},     // ENCLOSING_MARK
00048     {chLatin_M, chLatin_c, chNull},     // COMBINING_SPACING_MARK
00049     {chLatin_N, chLatin_d, chNull},     // DECIMAL_DIGIT_NUMBER
00050     {chLatin_N, chLatin_l, chNull},     // LETTER_NUMBER
00051     {chLatin_N, chLatin_o, chNull},     // OTHER_NUMBER
00052     {chLatin_Z, chLatin_s, chNull},     // SPACE_SEPARATOR
00053     {chLatin_Z, chLatin_l, chNull},     // LINE_SEPARATOR
00054     {chLatin_Z, chLatin_p, chNull},     // PARAGRAPH_SEPARATOR
00055     {chLatin_C, chLatin_c, chNull},     // CONTROL
00056     {chLatin_C, chLatin_f, chNull},     // FORMAT
00057     {chLatin_C, chLatin_o, chNull},     // PRIVATE_USE
00058     {chLatin_C, chLatin_s, chNull},     // SURROGATE
00059     {chLatin_P, chLatin_d, chNull},     // DASH_PUNCTUATION
00060     {chLatin_P, chLatin_s, chNull},     // START_PUNCTUATION
00061     {chLatin_P, chLatin_e, chNull},     // END_PUNCTUATION
00062     {chLatin_P, chLatin_c, chNull},     // CONNECTOR_PUNCTUATION
00063     {chLatin_P, chLatin_o, chNull},     // OTHER_PUNCTUATION
00064     {chLatin_S, chLatin_m, chNull},     // MATH_SYMBOL
00065     {chLatin_S, chLatin_c, chNull},     // CURRENCY_SYMBOL
00066     {chLatin_S, chLatin_k, chNull},     // MODIFIER_SYMBOL
00067     {chLatin_S, chLatin_o, chNull},     // OTHER_SYMBOL
00068     {chLatin_P, chLatin_i, chNull},     // INITIAL_PUNCTUATION
00069     {chLatin_P, chLatin_f, chNull},     // FINAL_PUNCTUATION
00070     {chLatin_L, chNull},                // CHAR_LETTER
00071     {chLatin_M, chNull},                // CHAR_MARK
00072     {chLatin_N, chNull},                // CHAR_NUMBER
00073     {chLatin_Z, chNull},                // CHAR_SEPARATOR
00074     {chLatin_C, chNull},                // CHAR_OTHER
00075     {chLatin_P, chNull},                // CHAR_PUNCTUATION
00076     {chLatin_S, chNull},                // CHAR_SYMBOL
00077 };
00078 
00079 // ---------------------------------------------------------------------------
00080 //  UnicodeRangeFactory: Constructors and Destructor
00081 // ---------------------------------------------------------------------------
00082 UnicodeRangeFactory::UnicodeRangeFactory()
00083 {
00084 }
00085 
00086 UnicodeRangeFactory::~UnicodeRangeFactory() {
00087 
00088 }
00089 
00090 // ---------------------------------------------------------------------------
00091 //  UnicodeRangeFactory: Range creation methods
00092 // ---------------------------------------------------------------------------
00093 void UnicodeRangeFactory::buildRanges(RangeTokenMap *rangeTokMap) {
00094 
00095     if (fRangesCreated)
00096         return;
00097 
00098     if (!fKeywordsInitialized) {
00099         initializeKeywordMap(rangeTokMap);
00100     }
00101 
00102     TokenFactory* tokFactory = rangeTokMap->getTokenFactory();
00103     RangeToken* ranges[UNICATEGSIZE];
00104     RangeToken* tok;
00105 
00106     for (int i=0; i < UNICATEGSIZE; i++) {
00107         ranges[i] = tokFactory->createRange();
00108     }
00109 
00110     for (int j=0; j < 0x10000; j++) {
00111 
00112         unsigned short charType = XMLUniCharacter::getType(j);
00113 
00114         ranges[charType]->addRange(j, j);
00115         charType = getUniCategory(charType);
00116         ranges[charType]->addRange(j, j);
00117     }
00118 
00119     ranges[XMLUniCharacter::UNASSIGNED]->addRange(0x10000, Token::UTF16_MAX);
00120 
00121     for (int k=0; k < UNICATEGSIZE; k++) {
00122         tok = RangeToken::complementRanges(ranges[k], tokFactory);
00123         // build the internal map.
00124         tok->createMap();
00125         rangeTokMap->setRangeToken(uniCategNames[k], ranges[k]);
00126         rangeTokMap->setRangeToken(uniCategNames[k], tok , true);
00127     }
00128 
00129     // Create all range
00130     tok = tokFactory->createRange();
00131     tok->addRange(0, Token::UTF16_MAX);
00132     // build the internal map.
00133     tok->createMap();
00134     rangeTokMap->setRangeToken(fgUniAll, tok);
00135 
00136     // Create alpha range
00137     tok = tokFactory->createRange();
00138     tok->mergeRanges(ranges[XMLUniCharacter::UPPERCASE_LETTER]);
00139     tok->mergeRanges(ranges[XMLUniCharacter::LOWERCASE_LETTER]);
00140     tok->mergeRanges(ranges[XMLUniCharacter::OTHER_LETTER]);
00141     // build the internal map.
00142     tok->createMap();
00143     rangeTokMap->setRangeToken(fgUniIsAlpha, tok);
00144 
00145     // Create alpha-num range
00146     RangeToken* alnumTok = tokFactory->createRange();
00147     alnumTok->mergeRanges(tok);
00148     alnumTok->mergeRanges(ranges[XMLUniCharacter::DECIMAL_DIGIT_NUMBER]);
00149     // build the internal map.
00150     alnumTok->createMap();
00151     rangeTokMap->setRangeToken(fgUniIsAlnum, alnumTok);
00152 
00153     // Create word range
00154     tok = tokFactory->createRange();
00155     tok->mergeRanges(alnumTok);
00156     tok->addRange(chUnderscore, chUnderscore);
00157     // build the internal map.
00158     tok->createMap();
00159     rangeTokMap->setRangeToken(fgUniIsWord, tok);
00160 
00161     tok = RangeToken::complementRanges(tok, tokFactory);
00162     // build the internal map.
00163     tok->createMap();
00164     rangeTokMap->setRangeToken(fgUniIsWord, tok , true);
00165 
00166     // Create assigned range
00167     tok = RangeToken::complementRanges(
00168                 ranges[XMLUniCharacter::UNASSIGNED],
00169                 tokFactory,
00170                 tokFactory->getMemoryManager());
00171     // build the internal map.
00172     tok->createMap();
00173     rangeTokMap->setRangeToken(fgUniAssigned,tok);
00174 
00175     // Create space range
00176     tok = tokFactory->createRange();
00177     tok->mergeRanges(ranges[XMLUniCharacter::SPACE_SEPARATOR]);
00178     tok->mergeRanges(ranges[XMLUniCharacter::LINE_SEPARATOR]);
00179     //tok->mergeRanges(ranges[XMLUniCharacter::PARAGRAPH_SEPARATOR]);
00180     // build the internal map.
00181     tok->createMap();
00182     rangeTokMap->setRangeToken(fgUniIsSpace, tok);
00183 
00184     tok = RangeToken::complementRanges(tok, tokFactory);
00185     // build the internal map.
00186     tok->createMap();
00187     rangeTokMap->setRangeToken(fgUniIsSpace, tok , true);
00188 
00189     RangeToken* const dummyToken =
00190         tokFactory->createRange();
00191 
00192     dummyToken->addRange(-1, -2);
00193     dummyToken->createMap();
00194 
00195     // build the internal maps.
00196     for (int l=0; l < UNICATEGSIZE; l++) {
00197         ranges[l]->createMap();
00198         ranges[l]->setCaseInsensitiveToken(dummyToken);
00199     }
00200 
00201     fRangesCreated = true;
00202 }
00203 
00204 // ---------------------------------------------------------------------------
00205 //  UnicodeRangeFactory: Initialization methods
00206 // ---------------------------------------------------------------------------
00207 void UnicodeRangeFactory::initializeKeywordMap(RangeTokenMap *rangeTokMap) {
00208 
00209     if (fKeywordsInitialized)
00210         return;
00211 
00212     for (int k=0; k < UNICATEGSIZE; k++) {
00213         rangeTokMap->addKeywordMap(uniCategNames[k], fgUnicodeCategory);
00214     }
00215 
00216     rangeTokMap->addKeywordMap(fgUniAll, fgUnicodeCategory);
00217     rangeTokMap->addKeywordMap(fgUniIsAlpha, fgUnicodeCategory);
00218     rangeTokMap->addKeywordMap(fgUniIsAlnum, fgUnicodeCategory);
00219     rangeTokMap->addKeywordMap(fgUniIsWord, fgUnicodeCategory);
00220     rangeTokMap->addKeywordMap(fgUniAssigned, fgUnicodeCategory);
00221     rangeTokMap->addKeywordMap(fgUniIsSpace, fgUnicodeCategory);
00222 
00223     fKeywordsInitialized = true;
00224 }
00225 
00226 // ---------------------------------------------------------------------------
00227 //  UnicodeRangeFactory: Helper methods
00228 // ---------------------------------------------------------------------------
00229 unsigned short UnicodeRangeFactory::getUniCategory(const unsigned short type)
00230 {
00231     switch(type) {
00232     case XMLUniCharacter::UPPERCASE_LETTER:
00233     case XMLUniCharacter::LOWERCASE_LETTER:
00234     case XMLUniCharacter::TITLECASE_LETTER:
00235     case XMLUniCharacter::MODIFIER_LETTER:
00236     case XMLUniCharacter::OTHER_LETTER:
00237         return CHAR_LETTER;
00238     case XMLUniCharacter::NON_SPACING_MARK:
00239     case XMLUniCharacter::COMBINING_SPACING_MARK:
00240     case XMLUniCharacter::ENCLOSING_MARK:
00241         return CHAR_MARK;
00242     case XMLUniCharacter::DECIMAL_DIGIT_NUMBER:
00243     case XMLUniCharacter::LETTER_NUMBER:
00244     case XMLUniCharacter::OTHER_NUMBER:
00245         return CHAR_NUMBER;
00246     case XMLUniCharacter::SPACE_SEPARATOR:
00247     case XMLUniCharacter::LINE_SEPARATOR:
00248     case XMLUniCharacter::PARAGRAPH_SEPARATOR:
00249         return CHAR_SEPARATOR;
00250     case XMLUniCharacter::CONTROL:
00251     case XMLUniCharacter::FORMAT:
00252     case XMLUniCharacter::SURROGATE:
00253     case XMLUniCharacter::PRIVATE_USE:
00254     case XMLUniCharacter::UNASSIGNED:
00255         return CHAR_OTHER;
00256     case XMLUniCharacter::CONNECTOR_PUNCTUATION:
00257     case XMLUniCharacter::DASH_PUNCTUATION:
00258     case XMLUniCharacter::START_PUNCTUATION:
00259     case XMLUniCharacter::END_PUNCTUATION:
00260     case XMLUniCharacter::OTHER_PUNCTUATION:
00261     case XMLUniCharacter::INITIAL_PUNCTUATION:
00262     case XMLUniCharacter::FINAL_PUNCTUATION:
00263         return CHAR_PUNCTUATION;
00264     case XMLUniCharacter::MATH_SYMBOL:
00265     case XMLUniCharacter::CURRENCY_SYMBOL:
00266     case XMLUniCharacter::MODIFIER_SYMBOL:
00267     case XMLUniCharacter::OTHER_SYMBOL:
00268         return CHAR_SYMBOL;
00269     }
00270 
00271     return 0;
00272 }
00273 
00274 XERCES_CPP_NAMESPACE_END
00275