GME
13
|
00001 /* 00002 * Licensed to the Apache Software Foundation (ASF) under one or more 00003 * contributor license agreements. See the NOTICE file distributed with 00004 * this work for additional information regarding copyright ownership. 00005 * The ASF licenses this file to You under the Apache License, Version 2.0 00006 * (the "License"); you may not use this file except in compliance with 00007 * the License. You may obtain a copy of the License at 00008 * 00009 * http://www.apache.org/licenses/LICENSE-2.0 00010 * 00011 * Unless required by applicable law or agreed to in writing, software 00012 * distributed under the License is distributed on an "AS IS" BASIS, 00013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00014 * See the License for the specific language governing permissions and 00015 * limitations under the License. 00016 */ 00017 00018 /* 00019 * $Id: UnicodeRangeFactory.cpp 678879 2008-07-22 20:05:05Z amassari $ 00020 */ 00021 00022 // --------------------------------------------------------------------------- 00023 // Includes 00024 // --------------------------------------------------------------------------- 00025 #include <xercesc/util/regx/UnicodeRangeFactory.hpp> 00026 #include <xercesc/util/regx/TokenFactory.hpp> 00027 #include <xercesc/util/regx/RangeToken.hpp> 00028 #include <xercesc/util/regx/RangeTokenMap.hpp> 00029 #include <xercesc/util/regx/RegxDefs.hpp> 00030 #include <xercesc/util/regx/XMLUniCharacter.hpp> 00031 00032 XERCES_CPP_NAMESPACE_BEGIN 00033 00034 // --------------------------------------------------------------------------- 00035 // Local data 00036 // --------------------------------------------------------------------------- 00037 00038 const XMLCh uniCategNames[][3] = 00039 { 00040 {chLatin_C, chLatin_n, chNull}, // UNASSIGNED 00041 {chLatin_L, chLatin_u, chNull}, // UPPERCASE_LETTER 00042 {chLatin_L, chLatin_l, chNull}, // LOWERCASE_LETTER 00043 {chLatin_L, chLatin_t, chNull}, // TITLECASE_LETTER 00044 {chLatin_L, chLatin_m, chNull}, // MODIFIER_LETTER 00045 {chLatin_L, chLatin_o, chNull}, // OTHER_LETTER 00046 {chLatin_M, chLatin_n, chNull}, // NON_SPACING_MARK 00047 {chLatin_M, chLatin_e, chNull}, // ENCLOSING_MARK 00048 {chLatin_M, chLatin_c, chNull}, // COMBINING_SPACING_MARK 00049 {chLatin_N, chLatin_d, chNull}, // DECIMAL_DIGIT_NUMBER 00050 {chLatin_N, chLatin_l, chNull}, // LETTER_NUMBER 00051 {chLatin_N, chLatin_o, chNull}, // OTHER_NUMBER 00052 {chLatin_Z, chLatin_s, chNull}, // SPACE_SEPARATOR 00053 {chLatin_Z, chLatin_l, chNull}, // LINE_SEPARATOR 00054 {chLatin_Z, chLatin_p, chNull}, // PARAGRAPH_SEPARATOR 00055 {chLatin_C, chLatin_c, chNull}, // CONTROL 00056 {chLatin_C, chLatin_f, chNull}, // FORMAT 00057 {chLatin_C, chLatin_o, chNull}, // PRIVATE_USE 00058 {chLatin_C, chLatin_s, chNull}, // SURROGATE 00059 {chLatin_P, chLatin_d, chNull}, // DASH_PUNCTUATION 00060 {chLatin_P, chLatin_s, chNull}, // START_PUNCTUATION 00061 {chLatin_P, chLatin_e, chNull}, // END_PUNCTUATION 00062 {chLatin_P, chLatin_c, chNull}, // CONNECTOR_PUNCTUATION 00063 {chLatin_P, chLatin_o, chNull}, // OTHER_PUNCTUATION 00064 {chLatin_S, chLatin_m, chNull}, // MATH_SYMBOL 00065 {chLatin_S, chLatin_c, chNull}, // CURRENCY_SYMBOL 00066 {chLatin_S, chLatin_k, chNull}, // MODIFIER_SYMBOL 00067 {chLatin_S, chLatin_o, chNull}, // OTHER_SYMBOL 00068 {chLatin_P, chLatin_i, chNull}, // INITIAL_PUNCTUATION 00069 {chLatin_P, chLatin_f, chNull}, // FINAL_PUNCTUATION 00070 {chLatin_L, chNull}, // CHAR_LETTER 00071 {chLatin_M, chNull}, // CHAR_MARK 00072 {chLatin_N, chNull}, // CHAR_NUMBER 00073 {chLatin_Z, chNull}, // CHAR_SEPARATOR 00074 {chLatin_C, chNull}, // CHAR_OTHER 00075 {chLatin_P, chNull}, // CHAR_PUNCTUATION 00076 {chLatin_S, chNull}, // CHAR_SYMBOL 00077 }; 00078 00079 // --------------------------------------------------------------------------- 00080 // UnicodeRangeFactory: Constructors and Destructor 00081 // --------------------------------------------------------------------------- 00082 UnicodeRangeFactory::UnicodeRangeFactory() 00083 { 00084 } 00085 00086 UnicodeRangeFactory::~UnicodeRangeFactory() { 00087 00088 } 00089 00090 // --------------------------------------------------------------------------- 00091 // UnicodeRangeFactory: Range creation methods 00092 // --------------------------------------------------------------------------- 00093 void UnicodeRangeFactory::buildRanges(RangeTokenMap *rangeTokMap) { 00094 00095 if (fRangesCreated) 00096 return; 00097 00098 if (!fKeywordsInitialized) { 00099 initializeKeywordMap(rangeTokMap); 00100 } 00101 00102 TokenFactory* tokFactory = rangeTokMap->getTokenFactory(); 00103 RangeToken* ranges[UNICATEGSIZE]; 00104 RangeToken* tok; 00105 00106 for (int i=0; i < UNICATEGSIZE; i++) { 00107 ranges[i] = tokFactory->createRange(); 00108 } 00109 00110 for (int j=0; j < 0x10000; j++) { 00111 00112 unsigned short charType = XMLUniCharacter::getType(j); 00113 00114 ranges[charType]->addRange(j, j); 00115 charType = getUniCategory(charType); 00116 ranges[charType]->addRange(j, j); 00117 } 00118 00119 ranges[XMLUniCharacter::UNASSIGNED]->addRange(0x10000, Token::UTF16_MAX); 00120 00121 for (int k=0; k < UNICATEGSIZE; k++) { 00122 tok = RangeToken::complementRanges(ranges[k], tokFactory); 00123 // build the internal map. 00124 tok->createMap(); 00125 rangeTokMap->setRangeToken(uniCategNames[k], ranges[k]); 00126 rangeTokMap->setRangeToken(uniCategNames[k], tok , true); 00127 } 00128 00129 // Create all range 00130 tok = tokFactory->createRange(); 00131 tok->addRange(0, Token::UTF16_MAX); 00132 // build the internal map. 00133 tok->createMap(); 00134 rangeTokMap->setRangeToken(fgUniAll, tok); 00135 00136 // Create alpha range 00137 tok = tokFactory->createRange(); 00138 tok->mergeRanges(ranges[XMLUniCharacter::UPPERCASE_LETTER]); 00139 tok->mergeRanges(ranges[XMLUniCharacter::LOWERCASE_LETTER]); 00140 tok->mergeRanges(ranges[XMLUniCharacter::OTHER_LETTER]); 00141 // build the internal map. 00142 tok->createMap(); 00143 rangeTokMap->setRangeToken(fgUniIsAlpha, tok); 00144 00145 // Create alpha-num range 00146 RangeToken* alnumTok = tokFactory->createRange(); 00147 alnumTok->mergeRanges(tok); 00148 alnumTok->mergeRanges(ranges[XMLUniCharacter::DECIMAL_DIGIT_NUMBER]); 00149 // build the internal map. 00150 alnumTok->createMap(); 00151 rangeTokMap->setRangeToken(fgUniIsAlnum, alnumTok); 00152 00153 // Create word range 00154 tok = tokFactory->createRange(); 00155 tok->mergeRanges(alnumTok); 00156 tok->addRange(chUnderscore, chUnderscore); 00157 // build the internal map. 00158 tok->createMap(); 00159 rangeTokMap->setRangeToken(fgUniIsWord, tok); 00160 00161 tok = RangeToken::complementRanges(tok, tokFactory); 00162 // build the internal map. 00163 tok->createMap(); 00164 rangeTokMap->setRangeToken(fgUniIsWord, tok , true); 00165 00166 // Create assigned range 00167 tok = RangeToken::complementRanges( 00168 ranges[XMLUniCharacter::UNASSIGNED], 00169 tokFactory, 00170 tokFactory->getMemoryManager()); 00171 // build the internal map. 00172 tok->createMap(); 00173 rangeTokMap->setRangeToken(fgUniAssigned,tok); 00174 00175 // Create space range 00176 tok = tokFactory->createRange(); 00177 tok->mergeRanges(ranges[XMLUniCharacter::SPACE_SEPARATOR]); 00178 tok->mergeRanges(ranges[XMLUniCharacter::LINE_SEPARATOR]); 00179 //tok->mergeRanges(ranges[XMLUniCharacter::PARAGRAPH_SEPARATOR]); 00180 // build the internal map. 00181 tok->createMap(); 00182 rangeTokMap->setRangeToken(fgUniIsSpace, tok); 00183 00184 tok = RangeToken::complementRanges(tok, tokFactory); 00185 // build the internal map. 00186 tok->createMap(); 00187 rangeTokMap->setRangeToken(fgUniIsSpace, tok , true); 00188 00189 RangeToken* const dummyToken = 00190 tokFactory->createRange(); 00191 00192 dummyToken->addRange(-1, -2); 00193 dummyToken->createMap(); 00194 00195 // build the internal maps. 00196 for (int l=0; l < UNICATEGSIZE; l++) { 00197 ranges[l]->createMap(); 00198 ranges[l]->setCaseInsensitiveToken(dummyToken); 00199 } 00200 00201 fRangesCreated = true; 00202 } 00203 00204 // --------------------------------------------------------------------------- 00205 // UnicodeRangeFactory: Initialization methods 00206 // --------------------------------------------------------------------------- 00207 void UnicodeRangeFactory::initializeKeywordMap(RangeTokenMap *rangeTokMap) { 00208 00209 if (fKeywordsInitialized) 00210 return; 00211 00212 for (int k=0; k < UNICATEGSIZE; k++) { 00213 rangeTokMap->addKeywordMap(uniCategNames[k], fgUnicodeCategory); 00214 } 00215 00216 rangeTokMap->addKeywordMap(fgUniAll, fgUnicodeCategory); 00217 rangeTokMap->addKeywordMap(fgUniIsAlpha, fgUnicodeCategory); 00218 rangeTokMap->addKeywordMap(fgUniIsAlnum, fgUnicodeCategory); 00219 rangeTokMap->addKeywordMap(fgUniIsWord, fgUnicodeCategory); 00220 rangeTokMap->addKeywordMap(fgUniAssigned, fgUnicodeCategory); 00221 rangeTokMap->addKeywordMap(fgUniIsSpace, fgUnicodeCategory); 00222 00223 fKeywordsInitialized = true; 00224 } 00225 00226 // --------------------------------------------------------------------------- 00227 // UnicodeRangeFactory: Helper methods 00228 // --------------------------------------------------------------------------- 00229 unsigned short UnicodeRangeFactory::getUniCategory(const unsigned short type) 00230 { 00231 switch(type) { 00232 case XMLUniCharacter::UPPERCASE_LETTER: 00233 case XMLUniCharacter::LOWERCASE_LETTER: 00234 case XMLUniCharacter::TITLECASE_LETTER: 00235 case XMLUniCharacter::MODIFIER_LETTER: 00236 case XMLUniCharacter::OTHER_LETTER: 00237 return CHAR_LETTER; 00238 case XMLUniCharacter::NON_SPACING_MARK: 00239 case XMLUniCharacter::COMBINING_SPACING_MARK: 00240 case XMLUniCharacter::ENCLOSING_MARK: 00241 return CHAR_MARK; 00242 case XMLUniCharacter::DECIMAL_DIGIT_NUMBER: 00243 case XMLUniCharacter::LETTER_NUMBER: 00244 case XMLUniCharacter::OTHER_NUMBER: 00245 return CHAR_NUMBER; 00246 case XMLUniCharacter::SPACE_SEPARATOR: 00247 case XMLUniCharacter::LINE_SEPARATOR: 00248 case XMLUniCharacter::PARAGRAPH_SEPARATOR: 00249 return CHAR_SEPARATOR; 00250 case XMLUniCharacter::CONTROL: 00251 case XMLUniCharacter::FORMAT: 00252 case XMLUniCharacter::SURROGATE: 00253 case XMLUniCharacter::PRIVATE_USE: 00254 case XMLUniCharacter::UNASSIGNED: 00255 return CHAR_OTHER; 00256 case XMLUniCharacter::CONNECTOR_PUNCTUATION: 00257 case XMLUniCharacter::DASH_PUNCTUATION: 00258 case XMLUniCharacter::START_PUNCTUATION: 00259 case XMLUniCharacter::END_PUNCTUATION: 00260 case XMLUniCharacter::OTHER_PUNCTUATION: 00261 case XMLUniCharacter::INITIAL_PUNCTUATION: 00262 case XMLUniCharacter::FINAL_PUNCTUATION: 00263 return CHAR_PUNCTUATION; 00264 case XMLUniCharacter::MATH_SYMBOL: 00265 case XMLUniCharacter::CURRENCY_SYMBOL: 00266 case XMLUniCharacter::MODIFIER_SYMBOL: 00267 case XMLUniCharacter::OTHER_SYMBOL: 00268 return CHAR_SYMBOL; 00269 } 00270 00271 return 0; 00272 } 00273 00274 XERCES_CPP_NAMESPACE_END 00275