GME  13
RegxParser.cpp
Go to the documentation of this file.
00001 /*
00002  * Licensed to the Apache Software Foundation (ASF) under one or more
00003  * contributor license agreements.  See the NOTICE file distributed with
00004  * this work for additional information regarding copyright ownership.
00005  * The ASF licenses this file to You under the Apache License, Version 2.0
00006  * (the "License"); you may not use this file except in compliance with
00007  * the License.  You may obtain a copy of the License at
00008  * 
00009  *      http://www.apache.org/licenses/LICENSE-2.0
00010  * 
00011  * Unless required by applicable law or agreed to in writing, software
00012  * distributed under the License is distributed on an "AS IS" BASIS,
00013  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00014  * See the License for the specific language governing permissions and
00015  * limitations under the License.
00016  */
00017 
00018 /*
00019  * $Id: RegxParser.cpp 834826 2009-11-11 10:03:53Z borisk $
00020  */
00021 
00022 // ---------------------------------------------------------------------------
00023 //  Includes
00024 // ---------------------------------------------------------------------------
00025 #include <xercesc/util/regx/RegxParser.hpp>
00026 #include <xercesc/util/XMLString.hpp>
00027 #include <xercesc/util/ParseException.hpp>
00028 #include <xercesc/util/regx/RegularExpression.hpp>
00029 #include <xercesc/util/regx/RegxUtil.hpp>
00030 #include <xercesc/util/regx/RegxDefs.hpp>
00031 #include <xercesc/util/regx/TokenInc.hpp>
00032 #include <xercesc/framework/XMLErrorCodes.hpp>
00033 
00034 XERCES_CPP_NAMESPACE_BEGIN
00035 
00036 // ---------------------------------------------------------------------------
00037 //  RegxParser::ReferencePosition: Constructors and Destructor
00038 // ---------------------------------------------------------------------------
00039 RegxParser::ReferencePosition::ReferencePosition(const int refNo,
00040                                                  const XMLSize_t position)
00041     :fReferenceNo(refNo)
00042     , fPosition(position)
00043 {
00044 
00045 }
00046 
00047 // ---------------------------------------------------------------------------
00048 //  RegxParser: Constructors and Destructors
00049 // ---------------------------------------------------------------------------
00050 RegxParser::RegxParser(MemoryManager* const manager)
00051     :fMemoryManager(manager),
00052      fHasBackReferences(false),
00053      fOptions(0),
00054      fOffset(0),
00055      fNoGroups(1),
00056      fParseContext(regexParserStateNormal),
00057      fStringLen(0),
00058      fState(REGX_T_EOF),
00059      fCharData(0),
00060      fString(0),
00061      fReferences(0),
00062      fTokenFactory(0)
00063 {
00064 }
00065 
00066 RegxParser::~RegxParser() {
00067 
00068     fMemoryManager->deallocate(fString);//delete [] fString;
00069     delete fReferences;
00070 }
00071 
00072 // ---------------------------------------------------------------------------
00073 //  RegxParser: Parsing methods
00074 // ---------------------------------------------------------------------------
00075 Token* RegxParser::parse(const XMLCh* const regxStr, const int options) {
00076 
00077     // if TokenFactory is not set do nothing.
00078     // REVISIT - should we throw an exception
00079     if (fTokenFactory == 0) {
00080         return 0;
00081     }
00082 
00083     fOptions = options;
00084     fOffset = 0;
00085     fNoGroups = 1;
00086     fHasBackReferences = false;
00087     setParseContext(regexParserStateNormal);
00088     if (fString)
00089         fMemoryManager->deallocate(fString);//delete [] fString;
00090     fString = XMLString::replicate(regxStr, fMemoryManager);
00091 
00092     if (isSet(RegularExpression::EXTENDED_COMMENT)) {
00093 
00094         if (fString)
00095             fMemoryManager->deallocate(fString);//delete [] fString;
00096         fString = RegxUtil::stripExtendedComment(regxStr, fMemoryManager);
00097     }
00098 
00099     fStringLen = XMLString::stringLen(fString);
00100     processNext();
00101 
00102     Token* retTok = parseRegx();
00103 
00104     if (fOffset != fStringLen) {
00105         XMLCh value1[65];
00106         XMLString::sizeToText(fOffset, value1, 64, 10, fMemoryManager);
00107         ThrowXMLwithMemMgr2(ParseException,XMLExcepts::Parser_Parse1, value1, fString, fMemoryManager);
00108     }
00109 
00110     if (fReferences != 0) {
00111 
00112         XMLSize_t refSize = fReferences->size();
00113         for (XMLSize_t i = 0; i < refSize; i++) {
00114 
00115             if (fNoGroups <= fReferences->elementAt(i)->fReferenceNo) {
00116                 ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Parse2, fMemoryManager);
00117             }
00118         }
00119 
00120         fReferences->removeAllElements();
00121     }
00122 
00123     return retTok;
00124 }
00125 
00126 
00127 void RegxParser::processNext() {
00128 
00129     if (fOffset >= fStringLen) {
00130 
00131         fCharData = -1;
00132         fState = REGX_T_EOF;
00133         return;
00134     }
00135 
00136     parserState nextState;
00137     XMLCh ch = fString[fOffset++];
00138     fCharData = ch;
00139 
00140     if (fParseContext == regexParserStateInBrackets) {
00141 
00142         switch (ch) {
00143         case chBackSlash:
00144             nextState = REGX_T_BACKSOLIDUS;
00145 
00146             if (fOffset >= fStringLen) {
00147                 ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Next1, fMemoryManager);
00148             }
00149 
00150             fCharData = fString[fOffset++];
00151             break;
00152         case chDash:
00153             if (fOffset < fStringLen && fString[fOffset] == chOpenSquare) {
00154 
00155                 fOffset++;
00156                 nextState = REGX_T_XMLSCHEMA_CC_SUBTRACTION;
00157             }
00158             else {
00159                 nextState = REGX_T_CHAR;
00160             }
00161             break;
00162         default:
00163             if (RegxUtil::isHighSurrogate(ch) && fOffset < fStringLen) {
00164 
00165                 XMLCh lowCh = fString[fOffset];
00166                 if (RegxUtil::isLowSurrogate(lowCh)) {
00167                     fCharData = RegxUtil::composeFromSurrogate(ch, lowCh);
00168                     fOffset++;
00169                 }
00170                 else {
00171                     throw XMLErrs::Expected2ndSurrogateChar;
00172                 }
00173             }
00174 
00175             nextState = REGX_T_CHAR;
00176         }
00177 
00178         fState = nextState;
00179         return;
00180     }
00181 
00182     switch (ch) {
00183 
00184     case chPipe:
00185         nextState = REGX_T_OR;
00186         break;
00187     case chAsterisk:
00188         nextState = REGX_T_STAR;
00189         break;
00190     case chPlus:
00191         nextState = REGX_T_PLUS;
00192         break;
00193     case chQuestion:
00194         nextState = REGX_T_QUESTION;
00195         break;
00196     case chCloseParen:
00197         nextState = REGX_T_RPAREN;
00198         break;
00199     case chPeriod:
00200         nextState = REGX_T_DOT;
00201         break;
00202     case chOpenSquare:
00203         nextState = REGX_T_LBRACKET;
00204         break;
00205     case chCaret:
00206         nextState = REGX_T_CARET;
00207         break;
00208     case chDollarSign:
00209         nextState = REGX_T_DOLLAR;
00210         break;
00211     case chOpenParen:
00212         nextState = REGX_T_LPAREN;
00213         break;
00214     case chBackSlash:
00215         nextState = REGX_T_BACKSOLIDUS;
00216         if (fOffset >= fStringLen) {
00217             ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Next1, fMemoryManager);
00218         }
00219 
00220         fCharData = fString[fOffset++];
00221         break;
00222     default:
00223         nextState = REGX_T_CHAR;
00224         if (RegxUtil::isHighSurrogate(ch) && fOffset < fStringLen) {
00225 
00226                 XMLCh lowCh = fString[fOffset];
00227                 if (RegxUtil::isLowSurrogate(lowCh)) {
00228                     fCharData = RegxUtil::composeFromSurrogate(ch, lowCh);
00229                     fOffset++;
00230                 }
00231                 else {
00232                     throw XMLErrs::Expected2ndSurrogateChar;
00233                 }
00234             }
00235     }
00236 
00237     fState = nextState;
00238 }
00239 
00240 
00241 Token* RegxParser::parseRegx(const bool matchingRParen) {
00242 
00243     Token* tok = parseTerm(matchingRParen);
00244     Token* parentTok = 0;
00245 
00246     while (fState == REGX_T_OR) {
00247 
00248         processNext();
00249         if (parentTok == 0) {
00250 
00251             parentTok = fTokenFactory->createUnion();
00252             parentTok->addChild(tok, fTokenFactory);
00253             tok = parentTok;
00254         }
00255 
00256         tok->addChild(parseTerm(matchingRParen), fTokenFactory);
00257     }
00258 
00259     return tok;
00260 }
00261 
00262 
00263 Token* RegxParser::parseTerm(const bool matchingRParen) {
00264 
00265     parserState state = fState;
00266 
00267     if (state == REGX_T_OR || state == REGX_T_EOF
00268         || (state == REGX_T_RPAREN && matchingRParen)) {
00269         return fTokenFactory->createToken(Token::T_EMPTY);
00270     }
00271     else {
00272 
00273         Token* tok = parseFactor();
00274         Token* concatTok = 0;
00275 
00276         while ((state = fState) != REGX_T_OR && state != REGX_T_EOF
00277                && (state != REGX_T_RPAREN || !matchingRParen))
00278         {
00279             if (concatTok == 0) {
00280 
00281                 concatTok = fTokenFactory->createUnion(true);
00282                 concatTok->addChild(tok, fTokenFactory);
00283                 tok = concatTok;
00284             }
00285             concatTok->addChild(parseFactor(), fTokenFactory);
00286         }
00287 
00288         return tok;
00289     }
00290 }
00291 
00292 
00293 Token* RegxParser::processCaret() {
00294 
00295     processNext();
00296     return fTokenFactory->getLineBegin();
00297 }
00298 
00299 
00300 Token* RegxParser::processDollar() {
00301 
00302     processNext();
00303     return fTokenFactory->getLineEnd();
00304 }
00305 
00306 
00307 Token* RegxParser::processStar(Token* const tok) {
00308 
00309     processNext();
00310 
00311     if (fState == REGX_T_QUESTION) {
00312         processNext();
00313         return fTokenFactory->createClosure(tok, true);
00314     }
00315 
00316     return fTokenFactory->createClosure(tok);
00317 }
00318 
00319 
00320 Token* RegxParser::processPlus(Token* const tok) {
00321 
00322     processNext();
00323 
00324     if (fState == REGX_T_QUESTION) {
00325         processNext();
00326         return fTokenFactory->createConcat(tok,
00327                            fTokenFactory->createClosure(tok,true));
00328     }
00329 
00330     return fTokenFactory->createConcat(tok,
00331                                 fTokenFactory->createClosure(tok));
00332 }
00333 
00334 
00335 Token* RegxParser::processQuestion(Token* const tok) {
00336 
00337     processNext();
00338 
00339     Token* parentTok = fTokenFactory->createUnion();
00340 
00341     if (fState == REGX_T_QUESTION) {
00342         processNext();
00343         parentTok->addChild(fTokenFactory->createToken(Token::T_EMPTY), fTokenFactory);
00344         parentTok->addChild(tok, fTokenFactory);
00345     }
00346     else {
00347         parentTok->addChild(tok, fTokenFactory);
00348         parentTok->addChild(fTokenFactory->createToken(Token::T_EMPTY), fTokenFactory);
00349     }
00350 
00351     return parentTok;
00352 }
00353 
00354 
00355 Token* RegxParser::processParen() {
00356 
00357     processNext();
00358     int num = fNoGroups++;
00359     Token* tok = fTokenFactory->createParenthesis(parseRegx(true),num);
00360 
00361     if (fState != REGX_T_RPAREN)
00362         ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Factor1, fMemoryManager);
00363 
00364     processNext();
00365     return tok;
00366 }
00367 
00368 
00369 Token* RegxParser::processBackReference() {
00370 
00371     XMLSize_t position = fOffset - 2;
00372 
00373     // Handle multi digit back references
00374     int refNo = fCharData - chDigit_0;
00375     while(true) {
00376         processNext();
00377         if(fState != REGX_T_CHAR || fCharData < chDigit_0 || fCharData > chDigit_9)
00378             break;
00379 
00380         int nextRefNo = (refNo * 10) + fCharData - chDigit_0;
00381         if(nextRefNo >= fNoGroups)
00382             break;
00383 
00384         refNo = nextRefNo;
00385     }
00386 
00387     Token* tok = fTokenFactory->createBackReference(refNo);
00388 
00389     fHasBackReferences = true;
00390     if (fReferences == 0) {
00391         fReferences = new (fMemoryManager) RefVectorOf<ReferencePosition>(8, true, fMemoryManager);
00392     }
00393 
00394     fReferences->addElement(new (fMemoryManager) ReferencePosition(refNo, position));
00395     return tok;
00396 }
00397 
00398 
00399 Token* RegxParser::parseFactor() {
00400 
00401     Token* tok = parseAtom();
00402 
00403     switch(fState) {
00404 
00405     case REGX_T_STAR:
00406         return processStar(tok);
00407     case REGX_T_PLUS:
00408         return processPlus(tok);
00409     case REGX_T_QUESTION:
00410         return processQuestion(tok);
00411     case REGX_T_CHAR:
00412         if (fCharData == chOpenCurly && fOffset < fStringLen) {
00413 
00414             int min = 0;
00415             int max = -1;
00416             XMLInt32 ch = fString[fOffset++];
00417 
00418             if (ch >= chDigit_0 && ch <= chDigit_9) {
00419 
00420                 min = ch - chDigit_0;
00421                 while (fOffset < fStringLen
00422                        && (ch = fString[fOffset++]) >= chDigit_0
00423                        && ch <= chDigit_9) {
00424 
00425                     min = min*10 + ch - chDigit_0;
00426                 }
00427 
00428                 if (min < 0)
00429                     ThrowXMLwithMemMgr1(ParseException, XMLExcepts::Parser_Quantifier5, fString, fMemoryManager);
00430             }
00431             else {
00432                 ThrowXMLwithMemMgr1(ParseException, XMLExcepts::Parser_Quantifier1, fString, fMemoryManager);
00433             }
00434 
00435             max = min;
00436 
00437             if (ch == chComma) {
00438 
00439                 if (fOffset >= fStringLen) {
00440                     ThrowXMLwithMemMgr1(ParseException, XMLExcepts::Parser_Quantifier3, fString, fMemoryManager);
00441                 }
00442                 else if ((ch = fString[fOffset++]) >= chDigit_0 && ch <= chDigit_9) {
00443 
00444                     max = ch - chDigit_0;
00445                     while (fOffset < fStringLen
00446                            && (ch = fString[fOffset++]) >= chDigit_0
00447                            && ch <= chDigit_9) {
00448 
00449                         max = max*10 + ch - chDigit_0;
00450                     }
00451 
00452                     if (max < 0)
00453                         ThrowXMLwithMemMgr1(ParseException, XMLExcepts::Parser_Quantifier5, fString, fMemoryManager);
00454                     else if (min > max)
00455                         ThrowXMLwithMemMgr1(ParseException, XMLExcepts::Parser_Quantifier4, fString, fMemoryManager);
00456                 }
00457                 else {
00458                     max = -1;
00459                 }
00460             }
00461 
00462             if (ch != chCloseCurly)  {
00463                 ThrowXMLwithMemMgr1(ParseException, XMLExcepts::Parser_Quantifier2, fString, fMemoryManager);
00464             }
00465 
00466             if (checkQuestion(fOffset)) {
00467 
00468                 tok = fTokenFactory->createClosure(tok, true);
00469                 fOffset++;
00470             }
00471             else {
00472                 tok = fTokenFactory->createClosure(tok);
00473             }
00474 
00475             tok->setMin(min);
00476             tok->setMax(max);
00477             processNext();
00478         }
00479         break;
00480     default:
00481         break;
00482     }
00483 
00484     return tok;
00485 }
00486 
00487 
00488 Token* RegxParser::parseAtom() {
00489 
00490     Token* tok = 0;
00491 
00492     switch(fState) {
00493 
00494     case REGX_T_LPAREN:
00495         return processParen();
00496     case REGX_T_DOT:
00497         processNext();
00498         tok = fTokenFactory->getDot();
00499         break;
00500     case REGX_T_CARET:
00501         return processCaret();
00502     case REGX_T_DOLLAR:
00503         return processDollar();
00504     case REGX_T_LBRACKET:
00505         return parseCharacterClass(true);
00506     case REGX_T_BACKSOLIDUS:
00507         switch(fCharData) {
00508 
00509         case chLatin_d:
00510         case chLatin_D:
00511         case chLatin_w:
00512         case chLatin_W:
00513         case chLatin_s:
00514         case chLatin_S:
00515         case chLatin_c:
00516         case chLatin_C:
00517         case chLatin_i:
00518         case chLatin_I:
00519             tok = getTokenForShorthand(fCharData);
00520             processNext();
00521             return tok;
00522         case chDigit_0:
00523         case chDigit_1:
00524         case chDigit_2:
00525         case chDigit_3:
00526         case chDigit_4:
00527         case chDigit_5:
00528         case chDigit_6:
00529         case chDigit_7:
00530         case chDigit_8:
00531         case chDigit_9:
00532             return processBackReference();
00533         case chLatin_p:
00534         case chLatin_P:
00535             {                
00536                 tok = processBacksolidus_pP(fCharData);
00537                 if (tok == 0) {
00538                     ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Atom5, fMemoryManager);
00539                 }
00540             }
00541             break;
00542         default:
00543             {
00544                 XMLInt32 ch = decodeEscaped();
00545                 if (ch < 0x10000) {
00546                     tok = fTokenFactory->createChar(ch);
00547                 }
00548                 else {
00549 
00550                     XMLCh* surrogateStr = RegxUtil::decomposeToSurrogates(ch, fMemoryManager);
00551                     ArrayJanitor<XMLCh> janSurrogate(surrogateStr, fMemoryManager);
00552                     tok = fTokenFactory->createString(surrogateStr);
00553                 }
00554             }
00555             break;
00556         } // end switch
00557 
00558         processNext();
00559         break;
00560     case REGX_T_CHAR:
00561         if (fCharData == chOpenCurly
00562             || fCharData == chCloseCurly
00563             || fCharData == chCloseSquare)
00564             ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Atom4, fMemoryManager);
00565 
00566         tok = fTokenFactory->createChar(fCharData);
00567         processNext();
00568         break;
00569     default:
00570         ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Atom4, fMemoryManager);
00571     } //end switch
00572 
00573     return tok;
00574 }
00575 
00576 
00577 RangeToken* RegxParser::processBacksolidus_pP(const XMLInt32 ch) {
00578 
00579     processNext();
00580 
00581     if (fState != REGX_T_CHAR || fCharData != chOpenCurly)
00582         ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Atom2, fMemoryManager);
00583 
00584     XMLSize_t nameStart = fOffset;
00585     int nameEnd = XMLString::indexOf(fString,chCloseCurly,nameStart, fMemoryManager);
00586 
00587     if (nameEnd < 0)
00588         ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Atom3, fMemoryManager);
00589     
00590     fOffset = nameEnd + 1;
00591     XMLCh* rangeName = (XMLCh*) fMemoryManager->allocate
00592     (
00593         (nameEnd - nameStart + 1) * sizeof(XMLCh)
00594     );//new XMLCh[(nameEnd - nameStart) + 1];
00595     ArrayJanitor<XMLCh> janRangeName(rangeName, fMemoryManager);
00596     XMLString::subString(rangeName, fString, nameStart, nameEnd, fMemoryManager);
00597 
00598     return  fTokenFactory->getRange(rangeName, !(ch == chLatin_p));
00599 }
00600 
00601 RangeToken* RegxParser::parseCharacterClass(const bool useNRange) {
00602 
00603     setParseContext(regexParserStateInBrackets);
00604     processNext();
00605 
00606     RangeToken* tok = 0;
00607     bool isNRange = false;
00608 
00609     if (getState() == REGX_T_CHAR && getCharData() == chCaret) {
00610         isNRange = true;
00611         processNext();
00612     }
00613     tok = fTokenFactory->createRange();
00614 
00615     parserState type;
00616     bool firstLoop = true;
00617     bool wasDecoded;
00618 
00619     while ( (type = getState()) != REGX_T_EOF) {
00620 
00621         wasDecoded = false;
00622 
00623         // single range | from-to-range | subtraction
00624         if (type == REGX_T_CHAR && getCharData() == chCloseSquare && !firstLoop)
00625             break;
00626 
00627         XMLInt32 ch = getCharData();
00628         bool     end = false;
00629 
00630         if (type == REGX_T_BACKSOLIDUS) {
00631 
00632             switch(ch) {
00633             case chLatin_d:
00634             case chLatin_D:
00635             case chLatin_w:
00636             case chLatin_W:
00637             case chLatin_s:
00638             case chLatin_S:
00639             case chLatin_i:
00640             case chLatin_I:
00641             case chLatin_c:
00642             case chLatin_C:
00643                 {
00644                     tok->mergeRanges(getTokenForShorthand(ch));
00645                     end = true;
00646                 }
00647                 break;
00648             case chLatin_p:
00649             case chLatin_P:
00650                 {                    
00651                     RangeToken* tok2 = processBacksolidus_pP(ch);
00652 
00653                     if (tok2 == 0) {
00654                         ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Atom5, getMemoryManager());
00655                     }
00656 
00657                     tok->mergeRanges(tok2);
00658                     end = true;
00659                 }
00660                 break;
00661             case chDash:
00662                 wasDecoded = true;
00663                 // fall thru to default.
00664             default:
00665                 ch = decodeEscaped();
00666             }
00667         } // end if REGX_T_BACKSOLIDUS
00668         else if (type == REGX_T_XMLSCHEMA_CC_SUBTRACTION && !firstLoop) {
00669 
00670             if (isNRange)
00671             {
00672                 tok = RangeToken::complementRanges(tok, fTokenFactory, fMemoryManager);
00673                 isNRange=false;
00674             }
00675             RangeToken* rangeTok = parseCharacterClass(false);
00676             tok->subtractRanges(rangeTok);
00677 
00678             if (getState() != REGX_T_CHAR || getCharData() != chCloseSquare) {
00679                 ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_CC5, getMemoryManager());
00680             }
00681             break;
00682         } // end if REGX_T_XMLSCHEMA...
00683 
00684         processNext();
00685 
00686         if (!end) {
00687 
00688             if (type == REGX_T_CHAR
00689                 && (ch == chOpenSquare
00690                     || ch == chCloseSquare
00691                     || (ch == chDash && getCharData() == chCloseSquare && firstLoop))) {
00692                 // if regex = [-] then invalid...
00693                 // '[', ']', '-' not allowed and should be escaped
00694                 XMLCh chStr[] = { ch, chNull };
00695                 ThrowXMLwithMemMgr2(ParseException,XMLExcepts::Parser_CC6, chStr, chStr, getMemoryManager());
00696             }
00697             if (ch == chDash && getCharData() == chDash && getState() != REGX_T_BACKSOLIDUS && !wasDecoded) {
00698                 XMLCh chStr[] = { ch, chNull };
00699                 ThrowXMLwithMemMgr2(ParseException,XMLExcepts::Parser_CC6, chStr, chStr, getMemoryManager());
00700             }
00701 
00702             if (getState() != REGX_T_CHAR || getCharData() != chDash) {
00703                 tok->addRange(ch, ch);
00704             }
00705             else {
00706 
00707                 processNext();
00708                 if ((type = getState()) == REGX_T_EOF)
00709                     ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_CC2, getMemoryManager());
00710 
00711                 if (type == REGX_T_CHAR && getCharData() == chCloseSquare) {
00712                     tok->addRange(ch, ch);
00713                     tok->addRange(chDash, chDash);
00714                 }
00715                 else if (type == REGX_T_XMLSCHEMA_CC_SUBTRACTION) {
00716 
00717                     static const XMLCh dashStr[] = { chDash, chNull};
00718                     ThrowXMLwithMemMgr2(ParseException, XMLExcepts::Parser_CC6, dashStr, dashStr, getMemoryManager());
00719                 }
00720                 else {
00721 
00722                     XMLInt32 rangeEnd = getCharData();
00723                     XMLCh rangeEndStr[] = { rangeEnd, chNull };
00724 
00725                     if (type == REGX_T_CHAR) {
00726 
00727                         if (rangeEnd == chOpenSquare
00728                             || rangeEnd == chCloseSquare
00729                             || rangeEnd == chDash)
00730                             // '[', ']', '-' not allowed and should be escaped
00731                             ThrowXMLwithMemMgr2(ParseException, XMLExcepts::Parser_CC6, rangeEndStr, rangeEndStr, getMemoryManager());
00732                     }
00733                     else if (type == REGX_T_BACKSOLIDUS) {
00734                         rangeEnd = decodeEscaped();
00735                     }
00736 
00737                     processNext();
00738 
00739                     if (ch > rangeEnd) {
00740                         XMLCh chStr[] = { ch, chNull };
00741                         ThrowXMLwithMemMgr2(ParseException,XMLExcepts::Parser_Ope3, rangeEndStr, chStr, getMemoryManager());
00742                     }
00743 
00744                     tok->addRange(ch, rangeEnd);
00745                 }
00746             }
00747         }
00748         firstLoop = false;
00749     }
00750 
00751     if (getState() == REGX_T_EOF)
00752         ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_CC2, getMemoryManager());
00753 
00754     if (isNRange)
00755     {
00756         if(useNRange)
00757             tok->setTokenType(Token::T_NRANGE);
00758         else
00759             tok = RangeToken::complementRanges(tok, fTokenFactory, fMemoryManager);
00760     }
00761 
00762     tok->sortRanges();
00763     tok->compactRanges();
00764 
00765     // If the case-insensitive option is enabled, we need to
00766     // have the new RangeToken instance build its internal
00767     // case-insensitive RangeToken.
00768     if (RegularExpression::isSet(fOptions, RegularExpression::IGNORE_CASE))
00769     {
00770         tok->getCaseInsensitiveToken(fTokenFactory);
00771     }
00772 
00773     setParseContext(regexParserStateNormal);
00774     processNext();
00775 
00776     return tok;
00777 }
00778 
00779 
00780 RangeToken* RegxParser::getTokenForShorthand(const XMLInt32 ch) {
00781 
00782     switch(ch) {
00783     case chLatin_d:
00784         return fTokenFactory->getRange(fgUniDecimalDigit);
00785         //return fTokenFactory->getRange(fgXMLDigit);
00786     case chLatin_D:
00787         return fTokenFactory->getRange(fgUniDecimalDigit, true);
00788         //return fTokenFactory->getRange(fgXMLDigit, true);
00789     case chLatin_w:
00790         return fTokenFactory->getRange(fgXMLWord);
00791     case chLatin_W:
00792         return fTokenFactory->getRange(fgXMLWord, true);
00793     case chLatin_s:
00794         return fTokenFactory->getRange(fgXMLSpace);
00795     case chLatin_S:
00796         return fTokenFactory->getRange(fgXMLSpace, true);
00797     case chLatin_c:
00798         return fTokenFactory->getRange(fgXMLNameChar);
00799     case chLatin_C:
00800         return fTokenFactory->getRange(fgXMLNameChar, true);
00801     case chLatin_i:
00802         return fTokenFactory->getRange(fgXMLInitialNameChar);
00803     case chLatin_I:
00804         return fTokenFactory->getRange(fgXMLInitialNameChar, true);
00805 //    default:
00806 //        ThrowXMLwithMemMgr(RuntimeException, "Invalid shorthand {0}", chAsString)
00807     }
00808 
00809     return 0;
00810 }
00811 
00812 
00813 XMLInt32 RegxParser::decodeEscaped() {
00814 
00815     if (fState != REGX_T_BACKSOLIDUS)
00816         ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Next1, getMemoryManager());
00817 
00818     XMLInt32 ch = fCharData;
00819 
00820     switch (ch) {
00821     case chLatin_n:
00822         ch = chLF;
00823         break;
00824     case chLatin_r:
00825         ch = chCR;
00826         break;
00827     case chLatin_t:
00828         ch = chHTab;
00829         break;
00830     case chBackSlash:
00831     case chPipe:
00832     case chPeriod:
00833     case chCaret:
00834     case chDash:
00835     case chQuestion:
00836     case chAsterisk:
00837     case chPlus:
00838     case chOpenCurly:
00839     case chCloseCurly:
00840     case chOpenParen:
00841     case chCloseParen:
00842     case chOpenSquare:
00843     case chCloseSquare:
00844     case chDollarSign:
00845         break;
00846     default:
00847     {
00848         XMLCh chString[] = {chBackSlash, ch, chNull};        
00849         ThrowXMLwithMemMgr1(ParseException,XMLExcepts::Parser_Process2, chString, getMemoryManager());
00850     }
00851     }
00852 
00853     return ch;
00854 }
00855 
00856 // ---------------------------------------------------------------------------
00857 //  RegxParser: Helper Methods
00858 // ---------------------------------------------------------------------------
00859 bool RegxParser::checkQuestion(const XMLSize_t off) {
00860 
00861     return ((off < fStringLen) && fString[off] == chQuestion);
00862 }
00863 
00864 XERCES_CPP_NAMESPACE_END
00865