GME  13
Token.cpp
Go to the documentation of this file.
00001 /*
00002  * Licensed to the Apache Software Foundation (ASF) under one or more
00003  * contributor license agreements.  See the NOTICE file distributed with
00004  * this work for additional information regarding copyright ownership.
00005  * The ASF licenses this file to You under the Apache License, Version 2.0
00006  * (the "License"); you may not use this file except in compliance with
00007  * the License.  You may obtain a copy of the License at
00008  * 
00009  *      http://www.apache.org/licenses/LICENSE-2.0
00010  * 
00011  * Unless required by applicable law or agreed to in writing, software
00012  * distributed under the License is distributed on an "AS IS" BASIS,
00013  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00014  * See the License for the specific language governing permissions and
00015  * limitations under the License.
00016  */
00017 
00018 /*
00019  * $Id: Token.cpp 678879 2008-07-22 20:05:05Z amassari $
00020  */
00021 
00022 // ---------------------------------------------------------------------------
00023 //  Includes
00024 // ---------------------------------------------------------------------------
00025 #include <xercesc/util/regx/RangeToken.hpp>
00026 #include <xercesc/util/regx/RegularExpression.hpp>
00027 #include <xercesc/util/regx/RegxUtil.hpp>
00028 
00029 XERCES_CPP_NAMESPACE_BEGIN
00030 
00031 // ---------------------------------------------------------------------------
00032 //  Static member data initialization
00033 // ---------------------------------------------------------------------------
00034 const XMLInt32         Token::UTF16_MAX = 0x10FFFF;
00035 
00036 // ---------------------------------------------------------------------------
00037 //  Token: Constructors and Destructors
00038 // ---------------------------------------------------------------------------
00039 Token::Token(const Token::tokType tkType
00040              , MemoryManager* const manager) 
00041              : fTokenType(tkType) 
00042              , fMemoryManager(manager)
00043 {
00044 
00045 }
00046 
00047 
00048 Token::~Token() {
00049 
00050 }
00051 
00052 // ---------------------------------------------------------------------------
00053 //  Token: Getter methods
00054 // ---------------------------------------------------------------------------
00055 XMLSize_t Token::getMinLength() const {
00056 
00057     switch (fTokenType) {
00058 
00059     case T_CONCAT:
00060         {
00061             XMLSize_t sum = 0;
00062             XMLSize_t childSize = size();
00063 
00064             for (XMLSize_t i=0; i<childSize; i++) {
00065                 sum += getChild(i)->getMinLength();
00066             }
00067             return sum;
00068         }
00069     case T_UNION:
00070         {
00071             XMLSize_t childSize = size();
00072 
00073             if (childSize == 0) {
00074                 return 0;
00075             }
00076             XMLSize_t ret = getChild(0)->getMinLength();
00077 
00078             for (XMLSize_t i=1; i < childSize; i++) {
00079 
00080                 XMLSize_t min = getChild(i)->getMinLength();
00081                 if (min < ret)
00082                     ret = min;
00083             }
00084             return ret;
00085         }
00086     case T_CLOSURE:
00087     case T_NONGREEDYCLOSURE:
00088         if (getMin() >= 0)
00089             return getMin() * getChild(0)->getMinLength();
00090 
00091         return 0;
00092     case T_EMPTY:
00093     case T_ANCHOR:
00094         return 0;
00095     case T_DOT:
00096     case T_CHAR:
00097     case T_RANGE:
00098     case T_NRANGE:
00099         return 1;
00100     case T_PAREN:
00101         return getChild(0)->getMinLength();
00102     case T_BACKREFERENCE:
00103         return 0; // *****  - REVISIT
00104     case T_STRING:
00105         return XMLString::stringLen(getString());
00106 //    default:
00107 //        throw;
00108     }
00109 
00110     // We should not get here, but we have it to make some compilers happy
00111     return (XMLSize_t)-1;
00112 }
00113 
00114 
00115 int Token::getMaxLength() const {
00116 
00117     switch (fTokenType) {
00118 
00119     case T_CONCAT:
00120         {
00121             int sum = 0;
00122             XMLSize_t childSize = size();
00123 
00124             for (XMLSize_t i=0; i<childSize; i++) {
00125 
00126                 int val = getChild(i)->getMaxLength();
00127 
00128                 if (val < 0){
00129                     return -1;
00130                 }
00131                 sum += val;
00132             }
00133             return sum;
00134         }
00135     case T_UNION:
00136         {
00137             XMLSize_t childSize = size();
00138 
00139             if (childSize == 0)
00140                 return 0;
00141 
00142             int ret = getChild(0)->getMaxLength();
00143 
00144             for (XMLSize_t i = 1; ret > 0 && i < childSize; i++) {
00145 
00146                 int max = getChild(i)->getMaxLength();
00147 
00148                 if (max < 0) {
00149 
00150                     ret = -1;
00151                     break;
00152                 }
00153 
00154                 if (max > ret)
00155                     ret = max;
00156             }
00157             return ret;
00158         }
00159     case T_CLOSURE:
00160     case T_NONGREEDYCLOSURE:
00161         if (getMax() >= 0) {
00162             return getMax() * getChild(0)->getMaxLength();
00163         }
00164         return -1;
00165     case T_EMPTY:
00166     case T_ANCHOR:
00167         return 0;
00168     case T_CHAR:
00169         return 1;
00170     case T_DOT:
00171     case T_RANGE:
00172     case T_NRANGE:
00173         return 2;
00174     case T_PAREN:
00175         return getChild(0)->getMaxLength();
00176     case T_BACKREFERENCE:
00177         return -1; // REVISIT
00178     case T_STRING:
00179         return (int)XMLString::stringLen(getString());
00180 //    default:
00181 //        throw; //ThrowXML(RuntimeException, ...)
00182     } // end switch
00183 
00184     return -1;
00185 }
00186 
00187 // ---------------------------------------------------------------------------
00188 //  Token: Helper methods
00189 // ---------------------------------------------------------------------------
00190 Token::firstCharacterOptions Token::analyzeFirstCharacter(RangeToken* const rangeTok,
00191                                                           const int options,
00192                                                           TokenFactory* const tokFactory)
00193 {
00194     switch(fTokenType) {
00195     case T_CONCAT:
00196         {
00197             firstCharacterOptions ret = FC_CONTINUE;
00198             for (XMLSize_t i=0; i<size(); i++) {
00199 
00200                 Token* tok = getChild(i);
00201                 if (tok
00202                     && (ret=tok->analyzeFirstCharacter(rangeTok,
00203                                     options, tokFactory))!= FC_CONTINUE)
00204                     break;
00205             }
00206             return ret;
00207         }
00208     case T_UNION:
00209         {
00210             XMLSize_t childSize = size();
00211             if (childSize == 0)
00212                 return FC_CONTINUE;
00213 
00214             firstCharacterOptions ret = FC_CONTINUE;
00215             bool hasEmpty = false;
00216 
00217             for (XMLSize_t i=0; i < childSize; i++) {
00218 
00219                 ret = getChild(i)->analyzeFirstCharacter(rangeTok, options, tokFactory);
00220 
00221                 if (ret == FC_ANY)
00222                     break;
00223                 else
00224                     hasEmpty = true;
00225             }
00226             return hasEmpty ? FC_CONTINUE : ret;
00227         }
00228     case T_CLOSURE:
00229     case T_NONGREEDYCLOSURE:
00230         {
00231             Token* tok = getChild(0);
00232             if (tok)
00233                 tok->analyzeFirstCharacter(rangeTok, options, tokFactory);
00234             return FC_CONTINUE;
00235         }
00236     case T_DOT:
00237     return FC_ANY;
00238     case T_EMPTY:
00239     case T_ANCHOR:
00240         return FC_CONTINUE;
00241     case T_CHAR:
00242         {
00243             XMLInt32 ch = getChar();
00244             rangeTok->addRange(ch, ch);
00245             if (ch < 0x1000 && isSet(options,RegularExpression::IGNORE_CASE)) {
00246                 //REVISIT
00247             }
00248         }
00249         return FC_TERMINAL;
00250     case T_RANGE:
00251         {
00252             if (isSet(options, RegularExpression::IGNORE_CASE)) {
00253                 rangeTok->mergeRanges(((RangeToken*)
00254                                          this)->getCaseInsensitiveToken(tokFactory));
00255             }
00256             else {
00257                 rangeTok->mergeRanges(this);
00258             }
00259             return FC_TERMINAL;
00260         }
00261     case T_NRANGE:
00262         {
00263             if (isSet(options, RegularExpression::IGNORE_CASE)) {
00264 
00265                 RangeToken* caseITok = (((RangeToken*)
00266                                            this)->getCaseInsensitiveToken(tokFactory));
00267                 rangeTok->mergeRanges(RangeToken::complementRanges(caseITok, tokFactory, fMemoryManager));
00268             }
00269             else {
00270                 rangeTok->mergeRanges(
00271                     RangeToken::complementRanges((RangeToken*) this, tokFactory, fMemoryManager));
00272             }
00273         }
00274     case T_PAREN:
00275         {
00276             Token* tok = getChild(0);
00277             if (tok)
00278                 return tok->analyzeFirstCharacter(rangeTok,options, tokFactory);
00279         }
00280     case T_BACKREFERENCE:
00281         rangeTok->addRange(0, UTF16_MAX);
00282         return FC_ANY;
00283     case T_STRING:
00284         {
00285             const XMLCh* str = getString();
00286             XMLInt32 ch = str[0];
00287 
00288             if (RegxUtil::isHighSurrogate((XMLCh) ch)) {
00289             }
00290 
00291             rangeTok->addRange(ch, ch);
00292             if (ch<0x10000 && isSet(options,RegularExpression::IGNORE_CASE)) {
00293                 //REVISIT
00294             }
00295         }
00296         return FC_TERMINAL;
00297 //    default:
00298 //        throw;
00299     }
00300 
00301     return FC_CONTINUE;
00302 }
00303 
00304 
00305 Token* Token::findFixedString(int options, int& outOptions) {
00306 
00307     switch(fTokenType) {
00308 
00309     case T_CHAR:
00310         return 0;
00311     case T_STRING:
00312         outOptions = options;
00313         return this;
00314     case T_UNION:
00315     case T_CLOSURE:
00316     case T_NONGREEDYCLOSURE:
00317     case T_EMPTY:
00318     case T_ANCHOR:
00319     case T_RANGE:
00320     case T_NRANGE:
00321     case T_DOT:
00322     case T_BACKREFERENCE:
00323         return 0;
00324     case T_PAREN:
00325         return getChild(0)->findFixedString(options, outOptions);
00326     case T_CONCAT:
00327         {
00328             Token* prevTok = 0;
00329             int prevOptions = 0;
00330 
00331             for (XMLSize_t i=0; i<size(); i++) {
00332 
00333                 Token* tok = getChild(i)->findFixedString(options, outOptions);
00334 
00335                 if (prevTok == 0 || prevTok->isShorterThan(tok)) {
00336 
00337                     prevTok = tok;
00338                     prevOptions = outOptions;
00339                 }
00340             }
00341 
00342             outOptions = prevOptions;
00343             return prevTok;
00344         }
00345     } // end switch
00346 
00347     return 0;
00348 }
00349 
00350 
00351 bool Token::isShorterThan(Token* const tok) {
00352 
00353     if (tok == 0)
00354         return false;
00355 
00356     if (getTokenType() != T_STRING && tok->getTokenType() != T_STRING)
00357         return false; //Should we throw an exception?
00358 
00359     XMLSize_t length = XMLString::stringLen(getString());
00360     XMLSize_t tokLength = XMLString::stringLen(tok->getString());
00361 
00362     return length < tokLength;
00363 }
00364 
00365 XERCES_CPP_NAMESPACE_END
00366