GME
13
|
00001 /* 00002 * Licensed to the Apache Software Foundation (ASF) under one or more 00003 * contributor license agreements. See the NOTICE file distributed with 00004 * this work for additional information regarding copyright ownership. 00005 * The ASF licenses this file to You under the Apache License, Version 2.0 00006 * (the "License"); you may not use this file except in compliance with 00007 * the License. You may obtain a copy of the License at 00008 * 00009 * http://www.apache.org/licenses/LICENSE-2.0 00010 * 00011 * Unless required by applicable law or agreed to in writing, software 00012 * distributed under the License is distributed on an "AS IS" BASIS, 00013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00014 * See the License for the specific language governing permissions and 00015 * limitations under the License. 00016 */ 00017 00018 /* 00019 * $Id: Token.cpp 678879 2008-07-22 20:05:05Z amassari $ 00020 */ 00021 00022 // --------------------------------------------------------------------------- 00023 // Includes 00024 // --------------------------------------------------------------------------- 00025 #include <xercesc/util/regx/RangeToken.hpp> 00026 #include <xercesc/util/regx/RegularExpression.hpp> 00027 #include <xercesc/util/regx/RegxUtil.hpp> 00028 00029 XERCES_CPP_NAMESPACE_BEGIN 00030 00031 // --------------------------------------------------------------------------- 00032 // Static member data initialization 00033 // --------------------------------------------------------------------------- 00034 const XMLInt32 Token::UTF16_MAX = 0x10FFFF; 00035 00036 // --------------------------------------------------------------------------- 00037 // Token: Constructors and Destructors 00038 // --------------------------------------------------------------------------- 00039 Token::Token(const Token::tokType tkType 00040 , MemoryManager* const manager) 00041 : fTokenType(tkType) 00042 , fMemoryManager(manager) 00043 { 00044 00045 } 00046 00047 00048 Token::~Token() { 00049 00050 } 00051 00052 // --------------------------------------------------------------------------- 00053 // Token: Getter mthods 00054 // --------------------------------------------------------------------------- 00055 XMLSize_t Token::getMinLength() const { 00056 00057 switch (fTokenType) { 00058 00059 case T_CONCAT: 00060 { 00061 XMLSize_t sum = 0; 00062 XMLSize_t childSize = size(); 00063 00064 for (XMLSize_t i=0; i<childSize; i++) { 00065 sum += getChild(i)->getMinLength(); 00066 } 00067 return sum; 00068 } 00069 case T_UNION: 00070 { 00071 XMLSize_t childSize = size(); 00072 00073 if (childSize == 0) { 00074 return 0; 00075 } 00076 XMLSize_t ret = getChild(0)->getMinLength(); 00077 00078 for (XMLSize_t i=1; i < childSize; i++) { 00079 00080 XMLSize_t min = getChild(i)->getMinLength(); 00081 if (min < ret) 00082 ret = min; 00083 } 00084 return ret; 00085 } 00086 case T_CLOSURE: 00087 case T_NONGREEDYCLOSURE: 00088 if (getMin() >= 0) 00089 return getMin() * getChild(0)->getMinLength(); 00090 00091 return 0; 00092 case T_EMPTY: 00093 case T_ANCHOR: 00094 return 0; 00095 case T_DOT: 00096 case T_CHAR: 00097 case T_RANGE: 00098 case T_NRANGE: 00099 return 1; 00100 case T_PAREN: 00101 return getChild(0)->getMinLength(); 00102 case T_BACKREFERENCE: 00103 return 0; // ***** - REVISIT 00104 case T_STRING: 00105 return XMLString::stringLen(getString()); 00106 // default: 00107 // throw; 00108 } 00109 00110 // We should not get here, but we have it to make some compilers happy 00111 return (XMLSize_t)-1; 00112 } 00113 00114 00115 int Token::getMaxLength() const { 00116 00117 switch (fTokenType) { 00118 00119 case T_CONCAT: 00120 { 00121 int sum = 0; 00122 XMLSize_t childSize = size(); 00123 00124 for (XMLSize_t i=0; i<childSize; i++) { 00125 00126 int val = getChild(i)->getMaxLength(); 00127 00128 if (val < 0){ 00129 return -1; 00130 } 00131 sum += val; 00132 } 00133 return sum; 00134 } 00135 case T_UNION: 00136 { 00137 XMLSize_t childSize = size(); 00138 00139 if (childSize == 0) 00140 return 0; 00141 00142 int ret = getChild(0)->getMaxLength(); 00143 00144 for (XMLSize_t i = 1; ret > 0 && i < childSize; i++) { 00145 00146 int max = getChild(i)->getMaxLength(); 00147 00148 if (max < 0) { 00149 00150 ret = -1; 00151 break; 00152 } 00153 00154 if (max > ret) 00155 ret = max; 00156 } 00157 return ret; 00158 } 00159 case T_CLOSURE: 00160 case T_NONGREEDYCLOSURE: 00161 if (getMax() >= 0) { 00162 return getMax() * getChild(0)->getMaxLength(); 00163 } 00164 return -1; 00165 case T_EMPTY: 00166 case T_ANCHOR: 00167 return 0; 00168 case T_CHAR: 00169 return 1; 00170 case T_DOT: 00171 case T_RANGE: 00172 case T_NRANGE: 00173 return 2; 00174 case T_PAREN: 00175 return getChild(0)->getMaxLength(); 00176 case T_BACKREFERENCE: 00177 return -1; // REVISIT 00178 case T_STRING: 00179 return (int)XMLString::stringLen(getString()); 00180 // default: 00181 // throw; //ThrowXML(RuntimeException, ...) 00182 } // end switch 00183 00184 return -1; 00185 } 00186 00187 // --------------------------------------------------------------------------- 00188 // Token: Helper mthods 00189 // --------------------------------------------------------------------------- 00190 Token::firstCharacterOptions Token::analyzeFirstCharacter(RangeToken* const rangeTok, 00191 const int options, 00192 TokenFactory* const tokFactory) 00193 { 00194 switch(fTokenType) { 00195 case T_CONCAT: 00196 { 00197 firstCharacterOptions ret = FC_CONTINUE; 00198 for (XMLSize_t i=0; i<size(); i++) { 00199 00200 Token* tok = getChild(i); 00201 if (tok 00202 && (ret=tok->analyzeFirstCharacter(rangeTok, 00203 options, tokFactory))!= FC_CONTINUE) 00204 break; 00205 } 00206 return ret; 00207 } 00208 case T_UNION: 00209 { 00210 XMLSize_t childSize = size(); 00211 if (childSize == 0) 00212 return FC_CONTINUE; 00213 00214 firstCharacterOptions ret = FC_CONTINUE; 00215 bool hasEmpty = false; 00216 00217 for (XMLSize_t i=0; i < childSize; i++) { 00218 00219 ret = getChild(i)->analyzeFirstCharacter(rangeTok, options, tokFactory); 00220 00221 if (ret == FC_ANY) 00222 break; 00223 else 00224 hasEmpty = true; 00225 } 00226 return hasEmpty ? FC_CONTINUE : ret; 00227 } 00228 case T_CLOSURE: 00229 case T_NONGREEDYCLOSURE: 00230 { 00231 Token* tok = getChild(0); 00232 if (tok) 00233 tok->analyzeFirstCharacter(rangeTok, options, tokFactory); 00234 return FC_CONTINUE; 00235 } 00236 case T_DOT: 00237 return FC_ANY; 00238 case T_EMPTY: 00239 case T_ANCHOR: 00240 return FC_CONTINUE; 00241 case T_CHAR: 00242 { 00243 XMLInt32 ch = getChar(); 00244 rangeTok->addRange(ch, ch); 00245 if (ch < 0x1000 && isSet(options,RegularExpression::IGNORE_CASE)) { 00246 //REVISIT 00247 } 00248 } 00249 return FC_TERMINAL; 00250 case T_RANGE: 00251 { 00252 if (isSet(options, RegularExpression::IGNORE_CASE)) { 00253 rangeTok->mergeRanges(((RangeToken*) 00254 this)->getCaseInsensitiveToken(tokFactory)); 00255 } 00256 else { 00257 rangeTok->mergeRanges(this); 00258 } 00259 return FC_TERMINAL; 00260 } 00261 case T_NRANGE: 00262 { 00263 if (isSet(options, RegularExpression::IGNORE_CASE)) { 00264 00265 RangeToken* caseITok = (((RangeToken*) 00266 this)->getCaseInsensitiveToken(tokFactory)); 00267 rangeTok->mergeRanges(RangeToken::complementRanges(caseITok, tokFactory, fMemoryManager)); 00268 } 00269 else { 00270 rangeTok->mergeRanges( 00271 RangeToken::complementRanges((RangeToken*) this, tokFactory, fMemoryManager)); 00272 } 00273 } 00274 case T_PAREN: 00275 { 00276 Token* tok = getChild(0); 00277 if (tok) 00278 return tok->analyzeFirstCharacter(rangeTok,options, tokFactory); 00279 } 00280 case T_BACKREFERENCE: 00281 rangeTok->addRange(0, UTF16_MAX); 00282 return FC_ANY; 00283 case T_STRING: 00284 { 00285 const XMLCh* str = getString(); 00286 XMLInt32 ch = str[0]; 00287 00288 if (RegxUtil::isHighSurrogate((XMLCh) ch)) { 00289 } 00290 00291 rangeTok->addRange(ch, ch); 00292 if (ch<0x10000 && isSet(options,RegularExpression::IGNORE_CASE)) { 00293 //REVISIT 00294 } 00295 } 00296 return FC_TERMINAL; 00297 // default: 00298 // throw; 00299 } 00300 00301 return FC_CONTINUE; 00302 } 00303 00304 00305 Token* Token::findFixedString(int options, int& outOptions) { 00306 00307 switch(fTokenType) { 00308 00309 case T_CHAR: 00310 return 0; 00311 case T_STRING: 00312 outOptions = options; 00313 return this; 00314 case T_UNION: 00315 case T_CLOSURE: 00316 case T_NONGREEDYCLOSURE: 00317 case T_EMPTY: 00318 case T_ANCHOR: 00319 case T_RANGE: 00320 case T_NRANGE: 00321 case T_DOT: 00322 case T_BACKREFERENCE: 00323 return 0; 00324 case T_PAREN: 00325 return getChild(0)->findFixedString(options, outOptions); 00326 case T_CONCAT: 00327 { 00328 Token* prevTok = 0; 00329 int prevOptions = 0; 00330 00331 for (XMLSize_t i=0; i<size(); i++) { 00332 00333 Token* tok = getChild(i)->findFixedString(options, outOptions); 00334 00335 if (prevTok == 0 || prevTok->isShorterThan(tok)) { 00336 00337 prevTok = tok; 00338 prevOptions = outOptions; 00339 } 00340 } 00341 00342 outOptions = prevOptions; 00343 return prevTok; 00344 } 00345 } // end switch 00346 00347 return 0; 00348 } 00349 00350 00351 bool Token::isShorterThan(Token* const tok) { 00352 00353 if (tok == 0) 00354 return false; 00355 00356 if (getTokenType() != T_STRING && tok->getTokenType() != T_STRING) 00357 return false; //Should we throw an exception? 00358 00359 XMLSize_t length = XMLString::stringLen(getString()); 00360 XMLSize_t tokLength = XMLString::stringLen(tok->getString()); 00361 00362 return length < tokLength; 00363 } 00364 00365 XERCES_CPP_NAMESPACE_END 00366