GME
13
|
00001 /* 00002 * Licensed to the Apache Software Foundation (ASF) under one or more 00003 * contributor license agreements. See the NOTICE file distributed with 00004 * this work for additional information regarding copyright ownership. 00005 * The ASF licenses this file to You under the Apache License, Version 2.0 00006 * (the "License"); you may not use this file except in compliance with 00007 * the License. You may obtain a copy of the License at 00008 * 00009 * http://www.apache.org/licenses/LICENSE-2.0 00010 * 00011 * Unless required by applicable law or agreed to in writing, software 00012 * distributed under the License is distributed on an "AS IS" BASIS, 00013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00014 * See the License for the specific language governing permissions and 00015 * limitations under the License. 00016 */ 00017 00018 /* 00019 * $Id: RegxParser.cpp 834826 2009-11-11 10:03:53Z borisk $ 00020 */ 00021 00022 // --------------------------------------------------------------------------- 00023 // Includes 00024 // --------------------------------------------------------------------------- 00025 #include <xercesc/util/regx/RegxParser.hpp> 00026 #include <xercesc/util/XMLString.hpp> 00027 #include <xercesc/util/ParseException.hpp> 00028 #include <xercesc/util/regx/RegularExpression.hpp> 00029 #include <xercesc/util/regx/RegxUtil.hpp> 00030 #include <xercesc/util/regx/RegxDefs.hpp> 00031 #include <xercesc/util/regx/TokenInc.hpp> 00032 #include <xercesc/framework/XMLErrorCodes.hpp> 00033 00034 XERCES_CPP_NAMESPACE_BEGIN 00035 00036 // --------------------------------------------------------------------------- 00037 // RegxParser::ReferencePostion: Constructors and Destructor 00038 // --------------------------------------------------------------------------- 00039 RegxParser::ReferencePosition::ReferencePosition(const int refNo, 00040 const XMLSize_t position) 00041 :fReferenceNo(refNo) 00042 , fPosition(position) 00043 { 00044 00045 } 00046 00047 // --------------------------------------------------------------------------- 00048 // RegxParser: Constructors and Destructors 00049 // --------------------------------------------------------------------------- 00050 RegxParser::RegxParser(MemoryManager* const manager) 00051 :fMemoryManager(manager), 00052 fHasBackReferences(false), 00053 fOptions(0), 00054 fOffset(0), 00055 fNoGroups(1), 00056 fParseContext(regexParserStateNormal), 00057 fStringLen(0), 00058 fState(REGX_T_EOF), 00059 fCharData(0), 00060 fString(0), 00061 fReferences(0), 00062 fTokenFactory(0) 00063 { 00064 } 00065 00066 RegxParser::~RegxParser() { 00067 00068 fMemoryManager->deallocate(fString);//delete [] fString; 00069 delete fReferences; 00070 } 00071 00072 // --------------------------------------------------------------------------- 00073 // RegxParser: Parsing methods 00074 // --------------------------------------------------------------------------- 00075 Token* RegxParser::parse(const XMLCh* const regxStr, const int options) { 00076 00077 // if TokenFactory is not set do nothing. 00078 // REVISIT - should we throw an exception 00079 if (fTokenFactory == 0) { 00080 return 0; 00081 } 00082 00083 fOptions = options; 00084 fOffset = 0; 00085 fNoGroups = 1; 00086 fHasBackReferences = false; 00087 setParseContext(regexParserStateNormal); 00088 if (fString) 00089 fMemoryManager->deallocate(fString);//delete [] fString; 00090 fString = XMLString::replicate(regxStr, fMemoryManager); 00091 00092 if (isSet(RegularExpression::EXTENDED_COMMENT)) { 00093 00094 if (fString) 00095 fMemoryManager->deallocate(fString);//delete [] fString; 00096 fString = RegxUtil::stripExtendedComment(regxStr, fMemoryManager); 00097 } 00098 00099 fStringLen = XMLString::stringLen(fString); 00100 processNext(); 00101 00102 Token* retTok = parseRegx(); 00103 00104 if (fOffset != fStringLen) { 00105 XMLCh value1[65]; 00106 XMLString::sizeToText(fOffset, value1, 64, 10, fMemoryManager); 00107 ThrowXMLwithMemMgr2(ParseException,XMLExcepts::Parser_Parse1, value1, fString, fMemoryManager); 00108 } 00109 00110 if (fReferences != 0) { 00111 00112 XMLSize_t refSize = fReferences->size(); 00113 for (XMLSize_t i = 0; i < refSize; i++) { 00114 00115 if (fNoGroups <= fReferences->elementAt(i)->fReferenceNo) { 00116 ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Parse2, fMemoryManager); 00117 } 00118 } 00119 00120 fReferences->removeAllElements(); 00121 } 00122 00123 return retTok; 00124 } 00125 00126 00127 void RegxParser::processNext() { 00128 00129 if (fOffset >= fStringLen) { 00130 00131 fCharData = -1; 00132 fState = REGX_T_EOF; 00133 return; 00134 } 00135 00136 parserState nextState; 00137 XMLCh ch = fString[fOffset++]; 00138 fCharData = ch; 00139 00140 if (fParseContext == regexParserStateInBrackets) { 00141 00142 switch (ch) { 00143 case chBackSlash: 00144 nextState = REGX_T_BACKSOLIDUS; 00145 00146 if (fOffset >= fStringLen) { 00147 ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Next1, fMemoryManager); 00148 } 00149 00150 fCharData = fString[fOffset++]; 00151 break; 00152 case chDash: 00153 if (fOffset < fStringLen && fString[fOffset] == chOpenSquare) { 00154 00155 fOffset++; 00156 nextState = REGX_T_XMLSCHEMA_CC_SUBTRACTION; 00157 } 00158 else { 00159 nextState = REGX_T_CHAR; 00160 } 00161 break; 00162 default: 00163 if (RegxUtil::isHighSurrogate(ch) && fOffset < fStringLen) { 00164 00165 XMLCh lowCh = fString[fOffset]; 00166 if (RegxUtil::isLowSurrogate(lowCh)) { 00167 fCharData = RegxUtil::composeFromSurrogate(ch, lowCh); 00168 fOffset++; 00169 } 00170 else { 00171 throw XMLErrs::Expected2ndSurrogateChar; 00172 } 00173 } 00174 00175 nextState = REGX_T_CHAR; 00176 } 00177 00178 fState = nextState; 00179 return; 00180 } 00181 00182 switch (ch) { 00183 00184 case chPipe: 00185 nextState = REGX_T_OR; 00186 break; 00187 case chAsterisk: 00188 nextState = REGX_T_STAR; 00189 break; 00190 case chPlus: 00191 nextState = REGX_T_PLUS; 00192 break; 00193 case chQuestion: 00194 nextState = REGX_T_QUESTION; 00195 break; 00196 case chCloseParen: 00197 nextState = REGX_T_RPAREN; 00198 break; 00199 case chPeriod: 00200 nextState = REGX_T_DOT; 00201 break; 00202 case chOpenSquare: 00203 nextState = REGX_T_LBRACKET; 00204 break; 00205 case chCaret: 00206 nextState = REGX_T_CARET; 00207 break; 00208 case chDollarSign: 00209 nextState = REGX_T_DOLLAR; 00210 break; 00211 case chOpenParen: 00212 nextState = REGX_T_LPAREN; 00213 break; 00214 case chBackSlash: 00215 nextState = REGX_T_BACKSOLIDUS; 00216 if (fOffset >= fStringLen) { 00217 ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Next1, fMemoryManager); 00218 } 00219 00220 fCharData = fString[fOffset++]; 00221 break; 00222 default: 00223 nextState = REGX_T_CHAR; 00224 if (RegxUtil::isHighSurrogate(ch) && fOffset < fStringLen) { 00225 00226 XMLCh lowCh = fString[fOffset]; 00227 if (RegxUtil::isLowSurrogate(lowCh)) { 00228 fCharData = RegxUtil::composeFromSurrogate(ch, lowCh); 00229 fOffset++; 00230 } 00231 else { 00232 throw XMLErrs::Expected2ndSurrogateChar; 00233 } 00234 } 00235 } 00236 00237 fState = nextState; 00238 } 00239 00240 00241 Token* RegxParser::parseRegx(const bool matchingRParen) { 00242 00243 Token* tok = parseTerm(matchingRParen); 00244 Token* parentTok = 0; 00245 00246 while (fState == REGX_T_OR) { 00247 00248 processNext(); 00249 if (parentTok == 0) { 00250 00251 parentTok = fTokenFactory->createUnion(); 00252 parentTok->addChild(tok, fTokenFactory); 00253 tok = parentTok; 00254 } 00255 00256 tok->addChild(parseTerm(matchingRParen), fTokenFactory); 00257 } 00258 00259 return tok; 00260 } 00261 00262 00263 Token* RegxParser::parseTerm(const bool matchingRParen) { 00264 00265 parserState state = fState; 00266 00267 if (state == REGX_T_OR || state == REGX_T_EOF 00268 || (state == REGX_T_RPAREN && matchingRParen)) { 00269 return fTokenFactory->createToken(Token::T_EMPTY); 00270 } 00271 else { 00272 00273 Token* tok = parseFactor(); 00274 Token* concatTok = 0; 00275 00276 while ((state = fState) != REGX_T_OR && state != REGX_T_EOF 00277 && (state != REGX_T_RPAREN || !matchingRParen)) 00278 { 00279 if (concatTok == 0) { 00280 00281 concatTok = fTokenFactory->createUnion(true); 00282 concatTok->addChild(tok, fTokenFactory); 00283 tok = concatTok; 00284 } 00285 concatTok->addChild(parseFactor(), fTokenFactory); 00286 } 00287 00288 return tok; 00289 } 00290 } 00291 00292 00293 Token* RegxParser::processCaret() { 00294 00295 processNext(); 00296 return fTokenFactory->getLineBegin(); 00297 } 00298 00299 00300 Token* RegxParser::processDollar() { 00301 00302 processNext(); 00303 return fTokenFactory->getLineEnd(); 00304 } 00305 00306 00307 Token* RegxParser::processStar(Token* const tok) { 00308 00309 processNext(); 00310 00311 if (fState == REGX_T_QUESTION) { 00312 processNext(); 00313 return fTokenFactory->createClosure(tok, true); 00314 } 00315 00316 return fTokenFactory->createClosure(tok); 00317 } 00318 00319 00320 Token* RegxParser::processPlus(Token* const tok) { 00321 00322 processNext(); 00323 00324 if (fState == REGX_T_QUESTION) { 00325 processNext(); 00326 return fTokenFactory->createConcat(tok, 00327 fTokenFactory->createClosure(tok,true)); 00328 } 00329 00330 return fTokenFactory->createConcat(tok, 00331 fTokenFactory->createClosure(tok)); 00332 } 00333 00334 00335 Token* RegxParser::processQuestion(Token* const tok) { 00336 00337 processNext(); 00338 00339 Token* parentTok = fTokenFactory->createUnion(); 00340 00341 if (fState == REGX_T_QUESTION) { 00342 processNext(); 00343 parentTok->addChild(fTokenFactory->createToken(Token::T_EMPTY), fTokenFactory); 00344 parentTok->addChild(tok, fTokenFactory); 00345 } 00346 else { 00347 parentTok->addChild(tok, fTokenFactory); 00348 parentTok->addChild(fTokenFactory->createToken(Token::T_EMPTY), fTokenFactory); 00349 } 00350 00351 return parentTok; 00352 } 00353 00354 00355 Token* RegxParser::processParen() { 00356 00357 processNext(); 00358 int num = fNoGroups++; 00359 Token* tok = fTokenFactory->createParenthesis(parseRegx(true),num); 00360 00361 if (fState != REGX_T_RPAREN) 00362 ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Factor1, fMemoryManager); 00363 00364 processNext(); 00365 return tok; 00366 } 00367 00368 00369 Token* RegxParser::processBackReference() { 00370 00371 XMLSize_t position = fOffset - 2; 00372 00373 // Handle multi digit back references 00374 int refNo = fCharData - chDigit_0; 00375 while(true) { 00376 processNext(); 00377 if(fState != REGX_T_CHAR || fCharData < chDigit_0 || fCharData > chDigit_9) 00378 break; 00379 00380 int nextRefNo = (refNo * 10) + fCharData - chDigit_0; 00381 if(nextRefNo >= fNoGroups) 00382 break; 00383 00384 refNo = nextRefNo; 00385 } 00386 00387 Token* tok = fTokenFactory->createBackReference(refNo); 00388 00389 fHasBackReferences = true; 00390 if (fReferences == 0) { 00391 fReferences = new (fMemoryManager) RefVectorOf<ReferencePosition>(8, true, fMemoryManager); 00392 } 00393 00394 fReferences->addElement(new (fMemoryManager) ReferencePosition(refNo, position)); 00395 return tok; 00396 } 00397 00398 00399 Token* RegxParser::parseFactor() { 00400 00401 Token* tok = parseAtom(); 00402 00403 switch(fState) { 00404 00405 case REGX_T_STAR: 00406 return processStar(tok); 00407 case REGX_T_PLUS: 00408 return processPlus(tok); 00409 case REGX_T_QUESTION: 00410 return processQuestion(tok); 00411 case REGX_T_CHAR: 00412 if (fCharData == chOpenCurly && fOffset < fStringLen) { 00413 00414 int min = 0; 00415 int max = -1; 00416 XMLInt32 ch = fString[fOffset++]; 00417 00418 if (ch >= chDigit_0 && ch <= chDigit_9) { 00419 00420 min = ch - chDigit_0; 00421 while (fOffset < fStringLen 00422 && (ch = fString[fOffset++]) >= chDigit_0 00423 && ch <= chDigit_9) { 00424 00425 min = min*10 + ch - chDigit_0; 00426 } 00427 00428 if (min < 0) 00429 ThrowXMLwithMemMgr1(ParseException, XMLExcepts::Parser_Quantifier5, fString, fMemoryManager); 00430 } 00431 else { 00432 ThrowXMLwithMemMgr1(ParseException, XMLExcepts::Parser_Quantifier1, fString, fMemoryManager); 00433 } 00434 00435 max = min; 00436 00437 if (ch == chComma) { 00438 00439 if (fOffset >= fStringLen) { 00440 ThrowXMLwithMemMgr1(ParseException, XMLExcepts::Parser_Quantifier3, fString, fMemoryManager); 00441 } 00442 else if ((ch = fString[fOffset++]) >= chDigit_0 && ch <= chDigit_9) { 00443 00444 max = ch - chDigit_0; 00445 while (fOffset < fStringLen 00446 && (ch = fString[fOffset++]) >= chDigit_0 00447 && ch <= chDigit_9) { 00448 00449 max = max*10 + ch - chDigit_0; 00450 } 00451 00452 if (max < 0) 00453 ThrowXMLwithMemMgr1(ParseException, XMLExcepts::Parser_Quantifier5, fString, fMemoryManager); 00454 else if (min > max) 00455 ThrowXMLwithMemMgr1(ParseException, XMLExcepts::Parser_Quantifier4, fString, fMemoryManager); 00456 } 00457 else { 00458 max = -1; 00459 } 00460 } 00461 00462 if (ch != chCloseCurly) { 00463 ThrowXMLwithMemMgr1(ParseException, XMLExcepts::Parser_Quantifier2, fString, fMemoryManager); 00464 } 00465 00466 if (checkQuestion(fOffset)) { 00467 00468 tok = fTokenFactory->createClosure(tok, true); 00469 fOffset++; 00470 } 00471 else { 00472 tok = fTokenFactory->createClosure(tok); 00473 } 00474 00475 tok->setMin(min); 00476 tok->setMax(max); 00477 processNext(); 00478 } 00479 break; 00480 default: 00481 break; 00482 } 00483 00484 return tok; 00485 } 00486 00487 00488 Token* RegxParser::parseAtom() { 00489 00490 Token* tok = 0; 00491 00492 switch(fState) { 00493 00494 case REGX_T_LPAREN: 00495 return processParen(); 00496 case REGX_T_DOT: 00497 processNext(); 00498 tok = fTokenFactory->getDot(); 00499 break; 00500 case REGX_T_CARET: 00501 return processCaret(); 00502 case REGX_T_DOLLAR: 00503 return processDollar(); 00504 case REGX_T_LBRACKET: 00505 return parseCharacterClass(true); 00506 case REGX_T_BACKSOLIDUS: 00507 switch(fCharData) { 00508 00509 case chLatin_d: 00510 case chLatin_D: 00511 case chLatin_w: 00512 case chLatin_W: 00513 case chLatin_s: 00514 case chLatin_S: 00515 case chLatin_c: 00516 case chLatin_C: 00517 case chLatin_i: 00518 case chLatin_I: 00519 tok = getTokenForShorthand(fCharData); 00520 processNext(); 00521 return tok; 00522 case chDigit_0: 00523 case chDigit_1: 00524 case chDigit_2: 00525 case chDigit_3: 00526 case chDigit_4: 00527 case chDigit_5: 00528 case chDigit_6: 00529 case chDigit_7: 00530 case chDigit_8: 00531 case chDigit_9: 00532 return processBackReference(); 00533 case chLatin_p: 00534 case chLatin_P: 00535 { 00536 tok = processBacksolidus_pP(fCharData); 00537 if (tok == 0) { 00538 ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Atom5, fMemoryManager); 00539 } 00540 } 00541 break; 00542 default: 00543 { 00544 XMLInt32 ch = decodeEscaped(); 00545 if (ch < 0x10000) { 00546 tok = fTokenFactory->createChar(ch); 00547 } 00548 else { 00549 00550 XMLCh* surrogateStr = RegxUtil::decomposeToSurrogates(ch, fMemoryManager); 00551 ArrayJanitor<XMLCh> janSurrogate(surrogateStr, fMemoryManager); 00552 tok = fTokenFactory->createString(surrogateStr); 00553 } 00554 } 00555 break; 00556 } // end switch 00557 00558 processNext(); 00559 break; 00560 case REGX_T_CHAR: 00561 if (fCharData == chOpenCurly 00562 || fCharData == chCloseCurly 00563 || fCharData == chCloseSquare) 00564 ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Atom4, fMemoryManager); 00565 00566 tok = fTokenFactory->createChar(fCharData); 00567 processNext(); 00568 break; 00569 default: 00570 ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Atom4, fMemoryManager); 00571 } //end switch 00572 00573 return tok; 00574 } 00575 00576 00577 RangeToken* RegxParser::processBacksolidus_pP(const XMLInt32 ch) { 00578 00579 processNext(); 00580 00581 if (fState != REGX_T_CHAR || fCharData != chOpenCurly) 00582 ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Atom2, fMemoryManager); 00583 00584 XMLSize_t nameStart = fOffset; 00585 int nameEnd = XMLString::indexOf(fString,chCloseCurly,nameStart, fMemoryManager); 00586 00587 if (nameEnd < 0) 00588 ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Atom3, fMemoryManager); 00589 00590 fOffset = nameEnd + 1; 00591 XMLCh* rangeName = (XMLCh*) fMemoryManager->allocate 00592 ( 00593 (nameEnd - nameStart + 1) * sizeof(XMLCh) 00594 );//new XMLCh[(nameEnd - nameStart) + 1]; 00595 ArrayJanitor<XMLCh> janRangeName(rangeName, fMemoryManager); 00596 XMLString::subString(rangeName, fString, nameStart, nameEnd, fMemoryManager); 00597 00598 return fTokenFactory->getRange(rangeName, !(ch == chLatin_p)); 00599 } 00600 00601 RangeToken* RegxParser::parseCharacterClass(const bool useNRange) { 00602 00603 setParseContext(regexParserStateInBrackets); 00604 processNext(); 00605 00606 RangeToken* tok = 0; 00607 bool isNRange = false; 00608 00609 if (getState() == REGX_T_CHAR && getCharData() == chCaret) { 00610 isNRange = true; 00611 processNext(); 00612 } 00613 tok = fTokenFactory->createRange(); 00614 00615 parserState type; 00616 bool firstLoop = true; 00617 bool wasDecoded; 00618 00619 while ( (type = getState()) != REGX_T_EOF) { 00620 00621 wasDecoded = false; 00622 00623 // single range | from-to-range | subtraction 00624 if (type == REGX_T_CHAR && getCharData() == chCloseSquare && !firstLoop) 00625 break; 00626 00627 XMLInt32 ch = getCharData(); 00628 bool end = false; 00629 00630 if (type == REGX_T_BACKSOLIDUS) { 00631 00632 switch(ch) { 00633 case chLatin_d: 00634 case chLatin_D: 00635 case chLatin_w: 00636 case chLatin_W: 00637 case chLatin_s: 00638 case chLatin_S: 00639 case chLatin_i: 00640 case chLatin_I: 00641 case chLatin_c: 00642 case chLatin_C: 00643 { 00644 tok->mergeRanges(getTokenForShorthand(ch)); 00645 end = true; 00646 } 00647 break; 00648 case chLatin_p: 00649 case chLatin_P: 00650 { 00651 RangeToken* tok2 = processBacksolidus_pP(ch); 00652 00653 if (tok2 == 0) { 00654 ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Atom5, getMemoryManager()); 00655 } 00656 00657 tok->mergeRanges(tok2); 00658 end = true; 00659 } 00660 break; 00661 case chDash: 00662 wasDecoded = true; 00663 // fall thru to default. 00664 default: 00665 ch = decodeEscaped(); 00666 } 00667 } // end if REGX_T_BACKSOLIDUS 00668 else if (type == REGX_T_XMLSCHEMA_CC_SUBTRACTION && !firstLoop) { 00669 00670 if (isNRange) 00671 { 00672 tok = RangeToken::complementRanges(tok, fTokenFactory, fMemoryManager); 00673 isNRange=false; 00674 } 00675 RangeToken* rangeTok = parseCharacterClass(false); 00676 tok->subtractRanges(rangeTok); 00677 00678 if (getState() != REGX_T_CHAR || getCharData() != chCloseSquare) { 00679 ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_CC5, getMemoryManager()); 00680 } 00681 break; 00682 } // end if REGX_T_XMLSCHEMA... 00683 00684 processNext(); 00685 00686 if (!end) { 00687 00688 if (type == REGX_T_CHAR 00689 && (ch == chOpenSquare 00690 || ch == chCloseSquare 00691 || (ch == chDash && getCharData() == chCloseSquare && firstLoop))) { 00692 // if regex = [-] then invalid... 00693 // '[', ']', '-' not allowed and should be escaped 00694 XMLCh chStr[] = { ch, chNull }; 00695 ThrowXMLwithMemMgr2(ParseException,XMLExcepts::Parser_CC6, chStr, chStr, getMemoryManager()); 00696 } 00697 if (ch == chDash && getCharData() == chDash && getState() != REGX_T_BACKSOLIDUS && !wasDecoded) { 00698 XMLCh chStr[] = { ch, chNull }; 00699 ThrowXMLwithMemMgr2(ParseException,XMLExcepts::Parser_CC6, chStr, chStr, getMemoryManager()); 00700 } 00701 00702 if (getState() != REGX_T_CHAR || getCharData() != chDash) { 00703 tok->addRange(ch, ch); 00704 } 00705 else { 00706 00707 processNext(); 00708 if ((type = getState()) == REGX_T_EOF) 00709 ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_CC2, getMemoryManager()); 00710 00711 if (type == REGX_T_CHAR && getCharData() == chCloseSquare) { 00712 tok->addRange(ch, ch); 00713 tok->addRange(chDash, chDash); 00714 } 00715 else if (type == REGX_T_XMLSCHEMA_CC_SUBTRACTION) { 00716 00717 static const XMLCh dashStr[] = { chDash, chNull}; 00718 ThrowXMLwithMemMgr2(ParseException, XMLExcepts::Parser_CC6, dashStr, dashStr, getMemoryManager()); 00719 } 00720 else { 00721 00722 XMLInt32 rangeEnd = getCharData(); 00723 XMLCh rangeEndStr[] = { rangeEnd, chNull }; 00724 00725 if (type == REGX_T_CHAR) { 00726 00727 if (rangeEnd == chOpenSquare 00728 || rangeEnd == chCloseSquare 00729 || rangeEnd == chDash) 00730 // '[', ']', '-' not allowed and should be escaped 00731 ThrowXMLwithMemMgr2(ParseException, XMLExcepts::Parser_CC6, rangeEndStr, rangeEndStr, getMemoryManager()); 00732 } 00733 else if (type == REGX_T_BACKSOLIDUS) { 00734 rangeEnd = decodeEscaped(); 00735 } 00736 00737 processNext(); 00738 00739 if (ch > rangeEnd) { 00740 XMLCh chStr[] = { ch, chNull }; 00741 ThrowXMLwithMemMgr2(ParseException,XMLExcepts::Parser_Ope3, rangeEndStr, chStr, getMemoryManager()); 00742 } 00743 00744 tok->addRange(ch, rangeEnd); 00745 } 00746 } 00747 } 00748 firstLoop = false; 00749 } 00750 00751 if (getState() == REGX_T_EOF) 00752 ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_CC2, getMemoryManager()); 00753 00754 if (isNRange) 00755 { 00756 if(useNRange) 00757 tok->setTokenType(Token::T_NRANGE); 00758 else 00759 tok = RangeToken::complementRanges(tok, fTokenFactory, fMemoryManager); 00760 } 00761 00762 tok->sortRanges(); 00763 tok->compactRanges(); 00764 00765 // If the case-insensitive option is enabled, we need to 00766 // have the new RangeToken instance build its internal 00767 // case-insensitive RangeToken. 00768 if (RegularExpression::isSet(fOptions, RegularExpression::IGNORE_CASE)) 00769 { 00770 tok->getCaseInsensitiveToken(fTokenFactory); 00771 } 00772 00773 setParseContext(regexParserStateNormal); 00774 processNext(); 00775 00776 return tok; 00777 } 00778 00779 00780 RangeToken* RegxParser::getTokenForShorthand(const XMLInt32 ch) { 00781 00782 switch(ch) { 00783 case chLatin_d: 00784 return fTokenFactory->getRange(fgUniDecimalDigit); 00785 //return fTokenFactory->getRange(fgXMLDigit); 00786 case chLatin_D: 00787 return fTokenFactory->getRange(fgUniDecimalDigit, true); 00788 //return fTokenFactory->getRange(fgXMLDigit, true); 00789 case chLatin_w: 00790 return fTokenFactory->getRange(fgXMLWord); 00791 case chLatin_W: 00792 return fTokenFactory->getRange(fgXMLWord, true); 00793 case chLatin_s: 00794 return fTokenFactory->getRange(fgXMLSpace); 00795 case chLatin_S: 00796 return fTokenFactory->getRange(fgXMLSpace, true); 00797 case chLatin_c: 00798 return fTokenFactory->getRange(fgXMLNameChar); 00799 case chLatin_C: 00800 return fTokenFactory->getRange(fgXMLNameChar, true); 00801 case chLatin_i: 00802 return fTokenFactory->getRange(fgXMLInitialNameChar); 00803 case chLatin_I: 00804 return fTokenFactory->getRange(fgXMLInitialNameChar, true); 00805 // default: 00806 // ThrowXMLwithMemMgr(RuntimeException, "Invalid shorthand {0}", chAsString) 00807 } 00808 00809 return 0; 00810 } 00811 00812 00813 XMLInt32 RegxParser::decodeEscaped() { 00814 00815 if (fState != REGX_T_BACKSOLIDUS) 00816 ThrowXMLwithMemMgr(ParseException,XMLExcepts::Parser_Next1, getMemoryManager()); 00817 00818 XMLInt32 ch = fCharData; 00819 00820 switch (ch) { 00821 case chLatin_n: 00822 ch = chLF; 00823 break; 00824 case chLatin_r: 00825 ch = chCR; 00826 break; 00827 case chLatin_t: 00828 ch = chHTab; 00829 break; 00830 case chBackSlash: 00831 case chPipe: 00832 case chPeriod: 00833 case chCaret: 00834 case chDash: 00835 case chQuestion: 00836 case chAsterisk: 00837 case chPlus: 00838 case chOpenCurly: 00839 case chCloseCurly: 00840 case chOpenParen: 00841 case chCloseParen: 00842 case chOpenSquare: 00843 case chCloseSquare: 00844 case chDollarSign: 00845 break; 00846 default: 00847 { 00848 XMLCh chString[] = {chBackSlash, ch, chNull}; 00849 ThrowXMLwithMemMgr1(ParseException,XMLExcepts::Parser_Process2, chString, getMemoryManager()); 00850 } 00851 } 00852 00853 return ch; 00854 } 00855 00856 // --------------------------------------------------------------------------- 00857 // RegxParser: Helper Methods 00858 // --------------------------------------------------------------------------- 00859 bool RegxParser::checkQuestion(const XMLSize_t off) { 00860 00861 return ((off < fStringLen) && fString[off] == chQuestion); 00862 } 00863 00864 XERCES_CPP_NAMESPACE_END 00865