GME
13
|
00001 /* 00002 * Licensed to the Apache Software Foundation (ASF) under one or more 00003 * contributor license agreements. See the NOTICE file distributed with 00004 * this work for additional information regarding copyright ownership. 00005 * The ASF licenses this file to You under the Apache License, Version 2.0 00006 * (the "License"); you may not use this file except in compliance with 00007 * the License. You may obtain a copy of the License at 00008 * 00009 * http://www.apache.org/licenses/LICENSE-2.0 00010 * 00011 * Unless required by applicable law or agreed to in writing, software 00012 * distributed under the License is distributed on an "AS IS" BASIS, 00013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00014 * See the License for the specific language governing permissions and 00015 * limitations under the License. 00016 */ 00017 00018 /* 00019 * $Id: RegularExpression.cpp 822158 2009-10-06 07:52:59Z amassari $ 00020 */ 00021 00022 // --------------------------------------------------------------------------- 00023 // Includes 00024 // --------------------------------------------------------------------------- 00025 #include <xercesc/util/regx/RegularExpression.hpp> 00026 #include <xercesc/util/PlatformUtils.hpp> 00027 #include <xercesc/util/regx/Match.hpp> 00028 #include <xercesc/util/regx/RangeToken.hpp> 00029 #include <xercesc/util/regx/RegxDefs.hpp> 00030 #include <xercesc/util/regx/XMLUniCharacter.hpp> 00031 #include <xercesc/util/regx/ParserForXMLSchema.hpp> 00032 #include <xercesc/util/Janitor.hpp> 00033 #include <xercesc/util/ParseException.hpp> 00034 #include <xercesc/util/IllegalArgumentException.hpp> 00035 #include <xercesc/framework/XMLBuffer.hpp> 00036 #include <xercesc/util/OutOfMemoryException.hpp> 00037 #include <xercesc/util/XMLInitializer.hpp> 00038 #include <xercesc/util/XMLUniDefs.hpp> 00039 #include <xercesc/util/ValueStackOf.hpp> 00040 00041 XERCES_CPP_NAMESPACE_BEGIN 00042 00043 // 
--------------------------------------------------------------------------- 00044 // Static member data initialization 00045 // --------------------------------------------------------------------------- 00046 const unsigned int RegularExpression::IGNORE_CASE = 2; 00047 const unsigned int RegularExpression::SINGLE_LINE = 4; 00048 const unsigned int RegularExpression::MULTIPLE_LINE = 8; 00049 const unsigned int RegularExpression::EXTENDED_COMMENT = 16; 00050 const unsigned int RegularExpression::PROHIBIT_HEAD_CHARACTER_OPTIMIZATION = 128; 00051 const unsigned int RegularExpression::PROHIBIT_FIXED_STRING_OPTIMIZATION = 256; 00052 const unsigned int RegularExpression::XMLSCHEMA_MODE = 512; 00053 RangeToken* RegularExpression::fWordRange = 0; 00054 00055 bool RegularExpression::matchIgnoreCase(const XMLInt32 ch1, 00056 const XMLInt32 ch2) const 00057 { 00058 if (ch1 >= 0x10000) 00059 { 00060 XMLCh string1[2]; 00061 XMLCh string2[2]; 00062 00063 RegxUtil::decomposeToSurrogates(ch1, string1[0], string1[1]); 00064 00065 if (ch2 >= 0x10000) 00066 { 00067 RegxUtil::decomposeToSurrogates(ch2, string2[0], string2[1]); 00068 } 00069 else 00070 { 00071 // XMLString::compareNIString is broken, because it assume the 00072 // two strings must be of the same length. Note that two strings 00073 // of different length could compare as equal, because there is no 00074 // guarantee that a Unicode code point that is encoded in UTF-16 as 00075 // a surrogate pair does not have a case mapping to a code point 00076 // that is not in the surrogate range. Just to be safe, we pad the 00077 // shorter string with a space, which cannot hvae a case mapping. 
00078 string2[0] = (XMLCh)ch2; 00079 string2[1] = chSpace; 00080 } 00081 00082 return (0==XMLString::compareNIString(string1, string2, 2)); 00083 } 00084 else if (ch2 >= 0x10000) 00085 { 00086 const XMLCh string1[2] = { (XMLCh)ch1, chSpace }; 00087 XMLCh string2[2]; 00088 00089 RegxUtil::decomposeToSurrogates(ch2, string2[0], string2[1]); 00090 00091 return (0==XMLString::compareNIString(string1, string2, 2)); 00092 } 00093 else 00094 { 00095 const XMLCh char1 = (XMLCh)ch1; 00096 const XMLCh char2 = (XMLCh)ch2; 00097 00098 return (0==XMLString::compareNIString(&char1, &char2, 1)); 00099 } 00100 } 00101 00102 00103 00104 // --------------------------------------------------------------------------- 00105 // RegularExpression::Context: Constructors and Destructor 00106 // --------------------------------------------------------------------------- 00107 RegularExpression::Context::Context(MemoryManager* const manager) : 00108 fAdoptMatch(false) 00109 , fStart(0) 00110 , fLimit(0) 00111 , fLength(0) 00112 , fSize(0) 00113 , fStringMaxLen(0) 00114 , fOffsets(0) 00115 , fMatch(0) 00116 , fString(0) 00117 , fOptions(0) 00118 , fMemoryManager(manager) 00119 { 00120 } 00121 00122 RegularExpression::Context::Context(Context* src) : 00123 fAdoptMatch(false) 00124 , fStart(src->fStart) 00125 , fLimit(src->fLimit) 00126 , fLength(src->fLength) 00127 , fSize(src->fSize) 00128 , fStringMaxLen(src->fStringMaxLen) 00129 , fOffsets(0) 00130 , fMatch(0) 00131 , fString(src->fString) 00132 , fOptions(src->fOptions) 00133 , fMemoryManager(src->fMemoryManager) 00134 { 00135 if(src->fOffsets) 00136 { 00137 fOffsets = (int*) fMemoryManager->allocate(fSize* sizeof(int)); 00138 for (int i = 0; i< fSize; i++) 00139 fOffsets[i] = src->fOffsets[i]; 00140 } 00141 if(src->fMatch) 00142 { 00143 fMatch=new (fMemoryManager) Match(*src->fMatch); 00144 fAdoptMatch=true; 00145 } 00146 } 00147 00148 RegularExpression::Context& RegularExpression::Context::operator=(const RegularExpression::Context& 
other) 00149 { 00150 if (this != &other) 00151 { 00152 fStart=other.fStart; 00153 fLimit=other.fLimit; 00154 fLength=other.fLength; 00155 fStringMaxLen=other.fStringMaxLen; 00156 fString=other.fString; 00157 fOptions=other.fOptions; 00158 00159 // if offset and match are already allocated with the right size, reuse them 00160 // (fMatch can be provided by the user to get the data back) 00161 if(fMatch && other.fMatch && fMatch->getNoGroups()==other.fMatch->getNoGroups()) 00162 *fMatch=*other.fMatch; 00163 else 00164 { 00165 if (fAdoptMatch) 00166 delete fMatch; 00167 fMatch=0; 00168 if(other.fMatch) 00169 { 00170 fMatch=new (other.fMemoryManager) Match(*other.fMatch); 00171 fAdoptMatch=true; 00172 } 00173 } 00174 00175 if (fOffsets && other.fOffsets && fSize==other.fSize) 00176 { 00177 for (int i = 0; i< fSize; i++) 00178 fOffsets[i] = other.fOffsets[i]; 00179 } 00180 else 00181 { 00182 if(fOffsets) 00183 fMemoryManager->deallocate(fOffsets);//delete [] fOffsets; 00184 fOffsets=0; 00185 fSize=other.fSize; 00186 if(other.fOffsets) 00187 { 00188 fOffsets = (int*) other.fMemoryManager->allocate(fSize* sizeof(int)); 00189 for (int i = 0; i< fSize; i++) 00190 fOffsets[i] = other.fOffsets[i]; 00191 } 00192 } 00193 00194 fMemoryManager=other.fMemoryManager; 00195 } 00196 00197 return *this; 00198 } 00199 00200 RegularExpression::Context::~Context() 00201 { 00202 if (fOffsets) 00203 fMemoryManager->deallocate(fOffsets);//delete [] fOffsets; 00204 00205 if (fAdoptMatch) 00206 delete fMatch; 00207 } 00208 00209 // --------------------------------------------------------------------------- 00210 // RegularExpression::Context: Public methods 00211 // --------------------------------------------------------------------------- 00212 void RegularExpression::Context::reset(const XMLCh* const string 00213 , const XMLSize_t stringLen 00214 , const XMLSize_t start 00215 , const XMLSize_t limit 00216 , const int noClosures 00217 , const unsigned int options) 00218 { 00219 fString = 
string; 00220 fStringMaxLen = stringLen; 00221 fStart = start; 00222 fLimit = limit; 00223 fLength = fLimit - fStart; 00224 if (fAdoptMatch) 00225 delete fMatch; 00226 fMatch = 0; 00227 00228 if (fSize != noClosures) { 00229 if (fOffsets) 00230 fMemoryManager->deallocate(fOffsets);//delete [] fOffsets; 00231 fOffsets = (int*) fMemoryManager->allocate(noClosures * sizeof(int));//new int[noClosures]; 00232 } 00233 00234 fSize = noClosures; 00235 fOptions = options; 00236 00237 for (int i = 0; i< fSize; i++) 00238 fOffsets[i] = -1; 00239 } 00240 00241 bool RegularExpression::Context::nextCh(XMLInt32& ch, XMLSize_t& offset) 00242 { 00243 ch = fString[offset]; 00244 00245 if (RegxUtil::isHighSurrogate(ch)) { 00246 if ((offset + 1 < fLimit) && RegxUtil::isLowSurrogate(fString[offset+1])) { 00247 ch = RegxUtil::composeFromSurrogate(ch, fString[++offset]); 00248 } 00249 else return false; 00250 } 00251 else if (RegxUtil::isLowSurrogate(ch)) { 00252 return false; 00253 } 00254 00255 return true; 00256 } 00257 00258 // --------------------------------------------------------------------------- 00259 // RegularExpression: Constructors and Destructors 00260 // --------------------------------------------------------------------------- 00261 00262 typedef JanitorMemFunCall<RegularExpression> CleanupType; 00263 00264 RegularExpression::RegularExpression(const char* const pattern, 00265 MemoryManager* const manager) 00266 :fHasBackReferences(false), 00267 fFixedStringOnly(false), 00268 fNoGroups(0), 00269 fMinLength(0), 00270 fNoClosures(0), 00271 fOptions(0), 00272 fBMPattern(0), 00273 fPattern(0), 00274 fFixedString(0), 00275 fOperations(0), 00276 fTokenTree(0), 00277 fFirstChar(0), 00278 fOpFactory(manager), 00279 fTokenFactory(0), 00280 fMemoryManager(manager) 00281 { 00282 CleanupType cleanup(this, &RegularExpression::cleanUp); 00283 00284 try { 00285 00286 XMLCh* tmpBuf = XMLString::transcode(pattern, fMemoryManager); 00287 ArrayJanitor<XMLCh> janBuf(tmpBuf, 
fMemoryManager); 00288 setPattern(tmpBuf); 00289 } 00290 catch(const OutOfMemoryException&) 00291 { 00292 cleanup.release(); 00293 00294 throw; 00295 } 00296 00297 cleanup.release(); 00298 } 00299 00300 RegularExpression::RegularExpression(const char* const pattern, 00301 const char* const options, 00302 MemoryManager* const manager) 00303 :fHasBackReferences(false), 00304 fFixedStringOnly(false), 00305 fNoGroups(0), 00306 fMinLength(0), 00307 fNoClosures(0), 00308 fOptions(0), 00309 fBMPattern(0), 00310 fPattern(0), 00311 fFixedString(0), 00312 fOperations(0), 00313 fTokenTree(0), 00314 fFirstChar(0), 00315 fOpFactory(manager), 00316 fTokenFactory(0), 00317 fMemoryManager(manager) 00318 { 00319 CleanupType cleanup(this, &RegularExpression::cleanUp); 00320 00321 try { 00322 00323 XMLCh* tmpBuf = XMLString::transcode(pattern, fMemoryManager); 00324 ArrayJanitor<XMLCh> janBuf(tmpBuf, fMemoryManager); 00325 XMLCh* tmpOptions = XMLString::transcode(options, fMemoryManager); 00326 ArrayJanitor<XMLCh> janOps(tmpOptions, fMemoryManager); 00327 setPattern(tmpBuf, tmpOptions); 00328 } 00329 catch(const OutOfMemoryException&) 00330 { 00331 cleanup.release(); 00332 00333 throw; 00334 } 00335 00336 cleanup.release(); 00337 } 00338 00339 00340 RegularExpression::RegularExpression(const XMLCh* const pattern, 00341 MemoryManager* const manager) 00342 :fHasBackReferences(false), 00343 fFixedStringOnly(false), 00344 fNoGroups(0), 00345 fMinLength(0), 00346 fNoClosures(0), 00347 fOptions(0), 00348 fBMPattern(0), 00349 fPattern(0), 00350 fFixedString(0), 00351 fOperations(0), 00352 fTokenTree(0), 00353 fFirstChar(0), 00354 fOpFactory(manager), 00355 fTokenFactory(0), 00356 fMemoryManager(manager) 00357 { 00358 CleanupType cleanup(this, &RegularExpression::cleanUp); 00359 00360 try { 00361 00362 setPattern(pattern); 00363 } 00364 catch(const OutOfMemoryException&) 00365 { 00366 cleanup.release(); 00367 00368 throw; 00369 } 00370 00371 cleanup.release(); 00372 } 00373 00374 
RegularExpression::RegularExpression(const XMLCh* const pattern, 00375 const XMLCh* const options, 00376 MemoryManager* const manager) 00377 :fHasBackReferences(false), 00378 fFixedStringOnly(false), 00379 fNoGroups(0), 00380 fMinLength(0), 00381 fNoClosures(0), 00382 fOptions(0), 00383 fBMPattern(0), 00384 fPattern(0), 00385 fFixedString(0), 00386 fOperations(0), 00387 fTokenTree(0), 00388 fFirstChar(0), 00389 fOpFactory(manager), 00390 fTokenFactory(0), 00391 fMemoryManager(manager) 00392 { 00393 CleanupType cleanup(this, &RegularExpression::cleanUp); 00394 00395 try { 00396 00397 setPattern(pattern, options); 00398 } 00399 catch(const OutOfMemoryException&) 00400 { 00401 cleanup.release(); 00402 00403 throw; 00404 } 00405 00406 cleanup.release(); 00407 } 00408 00409 RegularExpression::~RegularExpression() { 00410 00411 cleanUp(); 00412 } 00413 00414 // --------------------------------------------------------------------------- 00415 // RegularExpression: Setter methods 00416 // --------------------------------------------------------------------------- 00417 00418 RegxParser* RegularExpression::getRegexParser(const int options, MemoryManager* const manager) 00419 { 00420 // the following construct causes an error in an Intel 7.1 32 bit compiler for 00421 // red hat linux 7.2 00422 // (when an exception is thrown the wrong object is deleted) 00423 //RegxParser* regxParser = isSet(fOptions, XMLSCHEMA_MODE) 00424 // ? 
new (fMemoryManager) ParserForXMLSchema(fMemoryManager) 00425 // : new (fMemoryManager) RegxParser(fMemoryManager); 00426 if (isSet(options, XMLSCHEMA_MODE)) 00427 return new (manager) ParserForXMLSchema(manager); 00428 00429 return new (manager) RegxParser(manager); 00430 } 00431 00432 void RegularExpression::setPattern(const XMLCh* const pattern, 00433 const XMLCh* const options) 00434 { 00435 00436 fTokenFactory = new (fMemoryManager) TokenFactory(fMemoryManager); 00437 fOptions = parseOptions(options); 00438 fPattern = XMLString::replicate(pattern, fMemoryManager); 00439 00440 RegxParser* regxParser=getRegexParser(fOptions, fMemoryManager); 00441 00442 if (regxParser) 00443 regxParser->setTokenFactory(fTokenFactory); 00444 00445 Janitor<RegxParser> janRegxParser(regxParser); 00446 fTokenTree = regxParser->parse(fPattern, fOptions); 00447 fNoGroups = regxParser->getNoParen(); 00448 fHasBackReferences = regxParser->hasBackReferences(); 00449 00450 prepare(); 00451 } 00452 00453 // --------------------------------------------------------------------------- 00454 // RegularExpression: Matching methods 00455 // --------------------------------------------------------------------------- 00456 bool RegularExpression::matches(const char* const expression 00457 , MemoryManager* const manager) const 00458 { 00459 XMLCh* tmpBuf = XMLString::transcode(expression, manager); 00460 ArrayJanitor<XMLCh> janBuf(tmpBuf, manager); 00461 return matches(tmpBuf, 0, XMLString::stringLen(tmpBuf), 0, manager); 00462 } 00463 00464 bool RegularExpression::matches(const char* const expression 00465 , const XMLSize_t start, const XMLSize_t end 00466 , MemoryManager* const manager) const 00467 { 00468 00469 XMLCh* tmpBuf = XMLString::transcode(expression, manager); 00470 ArrayJanitor<XMLCh> janBuf(tmpBuf, manager); 00471 return matches(tmpBuf, start, end, 0, manager); 00472 } 00473 00474 bool RegularExpression::matches(const char* const expression 00475 , Match* const match 00476 , 
MemoryManager* const manager) const 00477 { 00478 00479 XMLCh* tmpBuf = XMLString::transcode(expression, manager); 00480 ArrayJanitor<XMLCh> janBuf(tmpBuf, manager); 00481 return matches(tmpBuf, 0, XMLString::stringLen(tmpBuf), match, manager); 00482 } 00483 00484 bool RegularExpression::matches(const char* const expression, const XMLSize_t start 00485 , const XMLSize_t end, Match* const pMatch 00486 , MemoryManager* const manager) const 00487 { 00488 00489 XMLCh* tmpBuf = XMLString::transcode(expression, manager); 00490 ArrayJanitor<XMLCh> janBuf(tmpBuf, manager); 00491 return matches(tmpBuf, start, end, pMatch, manager); 00492 } 00493 00494 00495 // --------------------------------------------------------------------------- 00496 // RegularExpression: Matching methods - Wide char version 00497 // --------------------------------------------------------------------------- 00498 bool RegularExpression::matches(const XMLCh* const expression, MemoryManager* const manager) const 00499 { 00500 return matches(expression, 0, XMLString::stringLen(expression), 0, manager); 00501 } 00502 00503 bool RegularExpression::matches(const XMLCh* const expression 00504 , const XMLSize_t start, const XMLSize_t end 00505 , MemoryManager* const manager) const 00506 { 00507 return matches(expression, start, end, 0, manager); 00508 } 00509 00510 bool RegularExpression::matches(const XMLCh* const expression 00511 , Match* const match 00512 , MemoryManager* const manager) const 00513 { 00514 return matches(expression, 0, XMLString::stringLen(expression), match, manager); 00515 } 00516 00517 bool RegularExpression::matches(const XMLCh* const expression, const XMLSize_t start 00518 , const XMLSize_t end, Match* const pMatch 00519 , MemoryManager* const manager) const 00520 { 00521 00522 Context context(manager); 00523 XMLSize_t strLength = XMLString::stringLen(expression); 00524 00525 context.reset(expression, strLength, start, end, fNoClosures, fOptions); 00526 00527 bool adoptMatch = 
false; 00528 Match* lMatch = pMatch; 00529 00530 if (lMatch != 0) { 00531 lMatch->setNoGroups(fNoGroups); 00532 } 00533 else if (fHasBackReferences) { 00534 lMatch = new (manager) Match(manager); 00535 lMatch->setNoGroups(fNoGroups); 00536 adoptMatch = true; 00537 } 00538 00539 if (context.fAdoptMatch) 00540 delete context.fMatch; 00541 context.fMatch = lMatch; 00542 context.fAdoptMatch = adoptMatch; 00543 00544 if (isSet(fOptions, XMLSCHEMA_MODE)) { 00545 00546 int matchEnd = match(&context, fOperations, context.fStart); 00547 00548 if (matchEnd == (int)context.fLimit) { 00549 00550 if (context.fMatch != 0) { 00551 00552 context.fMatch->setStartPos(0, (int)context.fStart); 00553 context.fMatch->setEndPos(0, matchEnd); 00554 } 00555 return true; 00556 } 00557 00558 return false; 00559 } 00560 00561 /* 00562 * If the pattern has only fixed string, use Boyer-Moore 00563 */ 00564 if (fFixedStringOnly) { 00565 00566 int ret = fBMPattern->matches(expression, context.fStart, context.fLimit); 00567 if (ret >= 0) { 00568 00569 if (context.fMatch != 0) { 00570 context.fMatch->setStartPos(0, ret); 00571 context.fMatch->setEndPos(0, (int)(ret + XMLString::stringLen(fPattern))); 00572 } 00573 return true; 00574 } 00575 return false; 00576 } 00577 00578 /* 00579 * If the pattern contains a fixed string, we check with Boyer-Moore 00580 * whether the text contains the fixed string or not. 
If not found 00581 * return false 00582 */ 00583 if (fFixedString != 0) { 00584 00585 int ret = fBMPattern->matches(expression, context.fStart, context.fLimit); 00586 00587 if (ret < 0) { // No match 00588 return false; 00589 } 00590 } 00591 00592 // if the length is less than the minimum length, we cannot possibly match 00593 if(context.fLimit<fMinLength) 00594 return false; 00595 00596 XMLSize_t limit = context.fLimit - fMinLength; 00597 XMLSize_t matchStart; 00598 int matchEnd = -1; 00599 00600 /* 00601 * Check whether the expression start with ".*" 00602 */ 00603 if (fOperations != 0 && (fOperations->getOpType() == Op::O_CLOSURE || fOperations->getOpType() == Op::O_FINITE_CLOSURE) 00604 && fOperations->getChild()->getOpType() == Op::O_DOT) { 00605 00606 if (isSet(fOptions, SINGLE_LINE)) { 00607 matchStart = context.fStart; 00608 matchEnd = match(&context, fOperations, matchStart); 00609 } 00610 else { 00611 bool previousIsEOL = true; 00612 00613 for (matchStart=context.fStart; matchStart<=limit; matchStart++) { 00614 00615 XMLCh ch = expression[matchStart]; 00616 if (RegxUtil::isEOLChar(ch)) { 00617 previousIsEOL = true; 00618 } 00619 else { 00620 00621 if (previousIsEOL) { 00622 if (0 <= (matchEnd = match(&context, fOperations, 00623 matchStart))) 00624 break; 00625 } 00626 00627 previousIsEOL = false; 00628 } 00629 } 00630 } 00631 } 00632 else { 00633 /* 00634 * Optimization against the first char 00635 */ 00636 if (fFirstChar != 0) { 00637 bool ignoreCase = isSet(fOptions, IGNORE_CASE); 00638 RangeToken* range = fFirstChar; 00639 00640 if (ignoreCase) 00641 range = fFirstChar->getCaseInsensitiveToken(fTokenFactory); 00642 00643 for (matchStart=context.fStart; matchStart<=limit; matchStart++) { 00644 00645 XMLInt32 ch; 00646 00647 if (!context.nextCh(ch, matchStart)) 00648 break; 00649 00650 if (!range->match(ch)) 00651 continue; 00652 00653 if (0 <= (matchEnd = match(&context,fOperations,matchStart))) 00654 break; 00655 } 00656 } 00657 else { 00658 00659 /* 
00660 * Straightforward matching 00661 */ 00662 for (matchStart=context.fStart; matchStart<=limit; matchStart++) { 00663 00664 if (0 <= (matchEnd = match(&context,fOperations,matchStart))) 00665 break; 00666 } 00667 } 00668 } 00669 00670 if (matchEnd >= 0) { 00671 00672 if (context.fMatch != 0) { 00673 00674 context.fMatch->setStartPos(0, (int)matchStart); 00675 context.fMatch->setEndPos(0, matchEnd); 00676 } 00677 return true; 00678 } 00679 return false; 00680 } 00681 00682 // --------------------------------------------------------------------------- 00683 // RegularExpression: Tokenize methods 00684 // --------------------------------------------------------------------------- 00685 RefArrayVectorOf<XMLCh>* RegularExpression::tokenize(const char* const expression, 00686 MemoryManager* const manager) const 00687 { 00688 00689 XMLCh* tmpBuf = XMLString::transcode(expression, manager); 00690 ArrayJanitor<XMLCh> janBuf(tmpBuf, manager); 00691 return tokenize(tmpBuf, 0, XMLString::stringLen(tmpBuf), manager); 00692 } 00693 00694 RefArrayVectorOf<XMLCh>* RegularExpression::tokenize(const char* const expression, 00695 const XMLSize_t start, const XMLSize_t end, 00696 MemoryManager* const manager) const 00697 { 00698 00699 XMLCh* tmpBuf = XMLString::transcode(expression, manager); 00700 ArrayJanitor<XMLCh> janBuf(tmpBuf, manager); 00701 return tokenize(tmpBuf, start, end, manager); 00702 } 00703 00704 00705 00706 // --------------------------------------------------------------------------- 00707 // RegularExpression: Tokenize methods - Wide char version 00708 // --------------------------------------------------------------------------- 00709 RefArrayVectorOf<XMLCh>* RegularExpression::tokenize(const XMLCh* const expression, 00710 MemoryManager* const manager) const 00711 { 00712 return tokenize(expression, 0, XMLString::stringLen(expression), manager); 00713 } 00714 00715 RefArrayVectorOf<XMLCh>* RegularExpression::tokenize(const XMLCh* const matchString, 00716 const 
XMLSize_t start, const XMLSize_t end, 00717 MemoryManager* const manager) const 00718 { 00719 // check if matches zero length string - throw error if so 00720 if(matches(XMLUni::fgZeroLenString, manager)){ 00721 ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Regex_RepPatMatchesZeroString, manager); 00722 } 00723 00724 RefVectorOf<Match> *subEx = new (manager) RefVectorOf<Match>(10, true, manager); 00725 Janitor<RefVectorOf<Match> > janSubEx(subEx); 00726 00727 allMatches(matchString, start, end, subEx, manager); 00728 00729 RefArrayVectorOf<XMLCh> *tokens = new (manager) RefArrayVectorOf<XMLCh>(16, true, manager); 00730 XMLSize_t tokStart = start; 00731 00732 XMLSize_t i = 0; 00733 for(; i < subEx->size(); ++i) { 00734 Match *match = subEx->elementAt(i); 00735 XMLSize_t matchStart = match->getStartPos(0); 00736 00737 XMLCh *token = (XMLCh*)manager->allocate((matchStart + 1 - tokStart) * sizeof(XMLCh)); 00738 XMLString::subString(token, matchString, tokStart, matchStart, manager); 00739 tokens->addElement(token); 00740 00741 tokStart = match->getEndPos(0); 00742 } 00743 00744 XMLCh *token = (XMLCh*)manager->allocate((end + 1 - tokStart) * sizeof(XMLCh)); 00745 XMLString::subString(token, matchString, tokStart, end, manager); 00746 tokens->addElement(token); 00747 00748 return tokens; 00749 } 00750 00751 void RegularExpression::allMatches(const XMLCh* const matchString, const XMLSize_t start, const XMLSize_t end, 00752 RefVectorOf<Match> *subEx, MemoryManager* const manager) const 00753 { 00754 Context context(manager); 00755 context.reset(matchString, XMLString::stringLen(matchString), start, end, fNoClosures, fOptions); 00756 00757 context.fMatch = new (manager) Match(manager); 00758 context.fMatch->setNoGroups(fNoGroups); 00759 context.fAdoptMatch = true; 00760 00761 XMLSize_t matchStart = start; 00762 while(matchStart <= end) { 00763 XMLSize_t matchEnd = match(&context, fOperations, matchStart); 00764 if(matchEnd != (XMLSize_t)-1) { 00765 
context.fMatch->setStartPos(0, (int)matchStart); 00766 context.fMatch->setEndPos(0, (int)matchEnd); 00767 00768 subEx->addElement(context.fMatch); 00769 00770 context.fMatch = new (manager) Match(*(context.fMatch)); 00771 context.fAdoptMatch = true; 00772 00773 matchStart = matchEnd; 00774 } else { 00775 ++matchStart; 00776 } 00777 } 00778 } 00779 00780 00781 // ----------------------------------------------------------------------- 00782 // RegularExpression: Replace methods 00783 // ----------------------------------------------------------------------- 00784 XMLCh* RegularExpression::replace(const char* const matchString, 00785 const char* const replaceString, 00786 MemoryManager* const manager) const 00787 { 00788 00789 XMLCh* tmpBuf = XMLString::transcode(matchString, manager); 00790 ArrayJanitor<XMLCh> janBuf(tmpBuf, manager); 00791 XMLCh* tmpBuf2 = XMLString::transcode(replaceString, manager); 00792 ArrayJanitor<XMLCh> janBuf2(tmpBuf2, manager); 00793 00794 return replace(tmpBuf, tmpBuf2, 0, XMLString::stringLen(tmpBuf), manager); 00795 } 00796 00797 XMLCh* RegularExpression::replace(const char* const matchString, 00798 const char* const replaceString, 00799 const XMLSize_t start, const XMLSize_t end, 00800 MemoryManager* const manager) const 00801 { 00802 00803 XMLCh* tmpBuf = XMLString::transcode(matchString, manager); 00804 ArrayJanitor<XMLCh> janBuf(tmpBuf, manager); 00805 XMLCh* tmpBuf2 = XMLString::transcode(replaceString, manager); 00806 ArrayJanitor<XMLCh> janBuf2(tmpBuf2, manager); 00807 00808 return replace(tmpBuf, tmpBuf2, start, end, manager); 00809 } 00810 00811 00812 // --------------------------------------------------------------------------- 00813 // RegularExpression: Replace methods - Wide char version 00814 // --------------------------------------------------------------------------- 00815 XMLCh* RegularExpression::replace(const XMLCh* const matchString, 00816 const XMLCh* const replaceString, 00817 MemoryManager* const manager) const 
00818 { 00819 00820 return replace(matchString, replaceString, 0, 00821 XMLString::stringLen(matchString), manager); 00822 } 00823 00824 XMLCh* RegularExpression::replace(const XMLCh* const matchString, 00825 const XMLCh* const replaceString, 00826 const XMLSize_t start, const XMLSize_t end, 00827 MemoryManager* const manager) const 00828 { 00829 // check if matches zero length string - throw error if so 00830 if(matches(XMLUni::fgZeroLenString, manager)){ 00831 ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Regex_RepPatMatchesZeroString, manager); 00832 } 00833 00834 RefVectorOf<Match> *subEx = new (manager) RefVectorOf<Match>(10, true, manager); 00835 Janitor<RefVectorOf<Match> > janSubEx(subEx); 00836 00837 allMatches(matchString, start, end, subEx, manager); 00838 00839 XMLBuffer result(1023, manager); 00840 int tokStart = (int)start; 00841 00842 XMLSize_t i = 0; 00843 for(; i < subEx->size(); ++i) { 00844 Match *match = subEx->elementAt(i); 00845 int matchStart = match->getStartPos(0); 00846 00847 if(matchStart > tokStart) 00848 result.append(matchString + tokStart, matchStart - tokStart); 00849 subInExp(replaceString, matchString, match, result, manager); 00850 00851 tokStart = match->getEndPos(0); 00852 } 00853 00854 if(end > (XMLSize_t)tokStart) 00855 result.append(matchString + tokStart, end - tokStart); 00856 00857 return XMLString::replicate(result.getRawBuffer(), manager); 00858 } 00859 00860 /* 00861 * Helper for Replace. This method prepares the replacement string by substituting 00862 * in actual values for parenthesized sub expressions. 
00863 * 00864 * An error will be thrown if: 00865 * 1) there is chBackSlash not followed by a chDollarSign or chBackSlash 00866 * 2) there is an unescaped chDollarSign which is not followed by a digit 00867 * 00868 */ 00869 void RegularExpression::subInExp(const XMLCh* const repString, 00870 const XMLCh* const origString, 00871 const Match* subEx, 00872 XMLBuffer &result, 00873 MemoryManager* const manager) const 00874 { 00875 int numSubExp = subEx->getNoGroups() - 1; 00876 00877 for(const XMLCh *ptr = repString; *ptr != chNull; ++ptr) { 00878 if(*ptr == chDollarSign) { 00879 ++ptr; 00880 00881 // check that after the $ is a digit 00882 if(!XMLString::isDigit(*ptr)) { 00883 // invalid replace string - $ must be followed by a digit 00884 ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Regex_InvalidRepPattern, manager); 00885 } 00886 00887 int index = *ptr - chDigit_0; 00888 00889 const XMLCh *dig = ptr + 1; 00890 while(XMLString::isDigit(*dig)) { 00891 int newIndex = index * 10 + (*dig - chDigit_0); 00892 if(newIndex > numSubExp) break; 00893 00894 index = newIndex; 00895 ptr = dig; 00896 ++dig; 00897 } 00898 00899 // now check that the index is legal 00900 if(index <= numSubExp) { 00901 int start = subEx->getStartPos(index); 00902 int end = subEx->getEndPos(index); 00903 00904 // now copy the substring into the new string 00905 if(start < end) { 00906 result.append(origString + start, end - start); 00907 } 00908 } 00909 00910 } else { 00911 if(*ptr == chBackSlash) { 00912 ++ptr; 00913 00914 // if you have a slash and then a character that's not a $ or /, 00915 // then it's an invalid replace string 00916 if(*ptr != chDollarSign && *ptr != chBackSlash) { 00917 ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Regex_InvalidRepPattern, manager); 00918 } 00919 } 00920 00921 result.append(*ptr); 00922 } 00923 } 00924 } 00925 00926 00927 // ----------------------------------------------------------------------- 00928 // Static initialize and cleanup methods 00929 // 
----------------------------------------------------------------------- 00930 /* Library start-up hook: initializes the regular expression statics using the global memory manager. */ void 00931 XMLInitializer::initializeRegularExpression() 00932 { 00933 RegularExpression::staticInitialize(XMLPlatformUtils::fgMemoryManager); 00934 } 00935 00936 /* Library shutdown hook: forwards to RegularExpression::staticCleanup(). */ void 00937 XMLInitializer::terminateRegularExpression() 00938 { 00939 RegularExpression::staticCleanup(); 00940 } 00941 00942 /* Fetches the shared word-character range (fgUniIsWord) from TokenFactory into fWordRange; throws RuntimeException, reported through memoryManager, if no range is returned. */ void 00943 RegularExpression::staticInitialize(MemoryManager* memoryManager) 00944 { 00945 fWordRange = TokenFactory::staticGetRange(fgUniIsWord, false); 00946 00947 if (fWordRange == 0) 00948 ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Regex_RangeTokenGetError, fgUniIsWord, memoryManager); 00949 } 00950 00951 // --------------------------------------------------------------------------- 00952 // RegularExpression: Helpers methods 00953 // --------------------------------------------------------------------------- 00954 /* Maps one option letter (i, m, s, x, F, H, X) to its option bit flag; returns 0 for any other character. */ int RegularExpression::getOptionValue(const XMLCh ch) { 00955 00956 int ret = 0; 00957 00958 switch (ch) { 00959 00960 case chLatin_i: 00961 ret = IGNORE_CASE; 00962 break; 00963 case chLatin_m: 00964 ret = MULTIPLE_LINE; 00965 break; 00966 case chLatin_s: 00967 ret = SINGLE_LINE; 00968 break; 00969 case chLatin_x: 00970 ret = EXTENDED_COMMENT; 00971 break; 00972 case chLatin_F: 00973 ret = PROHIBIT_FIXED_STRING_OPTIMIZATION; 00974 break; 00975 case chLatin_H: 00976 ret = PROHIBIT_HEAD_CHARACTER_OPTIMIZATION; 00977 break; 00978 case chLatin_X: 00979 ret = XMLSCHEMA_MODE; 00980 break; 00981 default: 00982 break; 00983 } 00984 00985 return ret; 00986 } 00987 /* Saved backtracking point for match(): the operation that was entered (op_) and the subject offset at that moment (offs_). */ 00988 struct RE_RuntimeContext { 00989 const Op *op_; 00990 XMLSize_t offs_; 00991 00992 RE_RuntimeContext(const Op *op, XMLSize_t offs) : op_(op), offs_(offs) { } 00993 }; 00994 /* Runs the operation chain starting at 'operations' against the context's subject string, beginning at 'offset'. Returns the offset reached when the chain is exhausted, or -1 when matching fails. Greedy O_CLOSURE / O_QUESTION operations are tracked on an explicit stack when context->fLimit > 256 and handled by recursive calls otherwise; the remaining closure kinds, unions and captures always go through recursive helpers. */ 00995 int RegularExpression::match(Context* const context, const Op* const operations, 00996 XMLSize_t offset) const 00997 { 00998 ValueStackOf<RE_RuntimeContext>* opStack=NULL; 00999 Janitor<ValueStackOf<RE_RuntimeContext> > janStack(NULL); 01000 
if(context->fLimit > 256) 01001 { 01002 /* explicit stack of pending closure/question operations; owned by janStack */ opStack=new ValueStackOf<RE_RuntimeContext>(16, context->fMemoryManager); 01003 janStack.reset(opStack); 01004 } 01005 const Op* tmpOp = operations; 01006 bool ignoreCase = isSet(context->fOptions, IGNORE_CASE); 01007 /* doReturn == -5 means "no result yet, continue with tmpOp"; any other value is a result for the current branch */ int doReturn; 01008 01009 while (tmpOp != 0) { 01010 // no one wants to return -5, only -1, 0, and greater 01011 doReturn = -5; 01012 01013 if (offset > context->fLimit || offset < context->fStart) 01014 doReturn = -1; 01015 else 01016 { 01017 switch(tmpOp->getOpType()) { 01018 case Op::O_CHAR: 01019 if (!matchChar(context, tmpOp->getData(), offset, ignoreCase)) 01020 doReturn = -1; 01021 else 01022 tmpOp = tmpOp->getNextOp(); 01023 break; 01024 case Op::O_DOT: 01025 if (!matchDot(context, offset)) 01026 doReturn = -1; 01027 else 01028 tmpOp = tmpOp->getNextOp(); 01029 break; 01030 case Op::O_RANGE: 01031 case Op::O_NRANGE: 01032 if (!matchRange(context, tmpOp, offset, ignoreCase)) 01033 doReturn = -1; 01034 else 01035 tmpOp = tmpOp->getNextOp(); 01036 break; 01037 case Op::O_ANCHOR: 01038 if (!matchAnchor(context, tmpOp->getData(), offset)) 01039 doReturn = -1; 01040 else 01041 tmpOp = tmpOp->getNextOp(); 01042 break; 01043 case Op::O_BACKREFERENCE: 01044 if (!matchBackReference(context, tmpOp->getData(), offset, 01045 ignoreCase)) 01046 doReturn = -1; 01047 else 01048 tmpOp = tmpOp->getNextOp(); 01049 break; 01050 case Op::O_STRING: 01051 if (!matchString(context, tmpOp->getLiteral(), offset, ignoreCase)) 01052 doReturn = -1; 01053 else 01054 tmpOp = tmpOp->getNextOp(); 01055 break; 01056 case Op::O_FINITE_CLOSURE: 01057 { 01058 XMLInt32 id = tmpOp->getData(); 01059 // if id is not -1, it's a closure with a child token having a minimum length, 01060 // where id is the index of the fOffsets array where its status is stored 01061 if (id >= 0) { 01062 int prevOffset = context->fOffsets[id]; 01063 if (prevOffset < 0 || prevOffset != (int)offset) { 01064 context->fOffsets[id] = (int)offset; 01065 } 01066 else { 
01067 // the status didn't change, we haven't found other copies; move on to the next match 01068 context->fOffsets[id] = -1; 01069 tmpOp = tmpOp->getNextOp(); 01070 break; 01071 } 01072 } 01073 01074 // keep matching the child until it fails or stops consuming input 01075 int ret; 01076 while((ret = match(context, tmpOp->getChild(), offset)) != -1) 01077 { 01078 if(offset == (XMLSize_t)ret) 01079 break; 01080 offset = ret; 01081 } 01082 01083 if (id >= 0) { 01084 // loop has ended, reset the status for this closure 01085 context->fOffsets[id] = -1; 01086 } 01087 tmpOp = tmpOp->getNextOp(); 01088 } 01089 break; 01090 case Op::O_FINITE_NONGREEDYCLOSURE: 01091 { 01092 int ret = match(context,tmpOp->getNextOp(),offset); 01093 if (ret >= 0) 01094 doReturn = ret; 01095 else 01096 { 01097 // keep matching the child until it fails or stops consuming input 01098 int ret; 01099 while((ret = match(context, tmpOp->getChild(), offset)) != -1) 01100 { 01101 if(offset == (XMLSize_t)ret) 01102 break; 01103 offset = ret; 01104 } 01105 tmpOp = tmpOp->getNextOp(); 01106 } 01107 } 01108 break; 01109 case Op::O_CLOSURE: 01110 { 01111 XMLInt32 id = tmpOp->getData(); 01112 // if id is not -1, it's a closure with a child token having a minimum length, 01113 // where id is the index of the fOffsets array where its status is stored 01114 if (id >= 0) { 01115 int prevOffset = context->fOffsets[id]; 01116 if (prevOffset < 0 || prevOffset != (int)offset) { 01117 context->fOffsets[id] = (int)offset; 01118 } 01119 else { 01120 // the status didn't change, we haven't found other copies; move on to the next match 01121 context->fOffsets[id] = -1; 01122 tmpOp = tmpOp->getNextOp(); 01123 break; 01124 } 01125 } 01126 01127 if(opStack!=NULL) 01128 { 01129 opStack->push(RE_RuntimeContext(tmpOp, offset)); 01130 tmpOp = tmpOp->getChild(); 01131 } 01132 else 01133 { 01134 int ret = match(context, tmpOp->getChild(), offset); 01135 if (id >= 0) { 01136 context->fOffsets[id] = -1; 01137 } 01138 if (ret >= 0) 01139 doReturn = ret; 01140 else 01141 tmpOp = 
tmpOp->getNextOp(); 01142 } 01143 } 01144 break; 01145 case Op::O_QUESTION: 01146 { 01147 if(opStack!=NULL) 01148 { 01149 /* remember where to resume if the optional child fails */ opStack->push(RE_RuntimeContext(tmpOp, offset)); 01150 tmpOp = tmpOp->getChild(); 01151 } 01152 else 01153 { 01154 int ret = match(context, tmpOp->getChild(), offset); 01155 if (ret >= 0) 01156 doReturn = ret; 01157 else 01158 tmpOp = tmpOp->getNextOp(); 01159 } 01160 } 01161 break; 01162 case Op::O_NONGREEDYCLOSURE: 01163 case Op::O_NONGREEDYQUESTION: 01164 { 01165 /* non-greedy: first try to finish without the child */ int ret = match(context,tmpOp->getNextOp(),offset); 01166 if (ret >= 0) 01167 doReturn = ret; 01168 else 01169 tmpOp = tmpOp->getChild(); 01170 } 01171 break; 01172 case Op::O_UNION: 01173 doReturn = matchUnion(context, tmpOp, offset); 01174 break; 01175 case Op::O_CAPTURE: 01176 if (context->fMatch != 0 && tmpOp->getData() != 0) 01177 doReturn = matchCapture(context, tmpOp, offset); 01178 else 01179 tmpOp = tmpOp->getNextOp(); 01180 break; 01181 } 01182 } 01183 if (doReturn != -5) { 01184 /* a branch produced a result: return it unless a stacked closure/question can still be resumed */ if (opStack==NULL || opStack->size() == 0) 01185 return doReturn; 01186 RE_RuntimeContext ctx = opStack->pop(); 01187 tmpOp = ctx.op_; 01188 offset = ctx.offs_; 01189 if (tmpOp->getOpType() == Op::O_CLOSURE) { 01190 XMLInt32 id = tmpOp->getData(); 01191 if (id >= 0) { 01192 // loop has ended, reset the status for this closure 01193 context->fOffsets[id] = -1; 01194 } 01195 } 01196 if (tmpOp->getOpType() == Op::O_CLOSURE || tmpOp->getOpType() == Op::O_QUESTION) { 01197 if (doReturn >= 0) 01198 return doReturn; 01199 } 01200 tmpOp = tmpOp->getNextOp(); 01201 } 01202 } 01203 01204 return (int)offset; 01205 } 01206 01207 bool RegularExpression::matchChar(Context* const context, 01208 const XMLInt32 ch, XMLSize_t& offset, 01209 const bool ignoreCase) const 01210 { 01211 if (offset >= context->fLimit) 01212 return false; 01213 01214 XMLInt32 strCh = 0; 01215 01216 if (!context->nextCh(strCh, offset)) 01217 return false; 01218 01219 bool match = ignoreCase ? 
matchIgnoreCase(ch, strCh) 01220 : (ch == strCh); 01221 if (!match) 01222 return false; 01223 01224 ++offset; 01225 01226 return true; 01227 } 01228 01229 bool RegularExpression::matchDot(Context* const context, XMLSize_t& offset) const 01230 { 01231 if (offset >= context->fLimit) 01232 return false; 01233 01234 XMLInt32 strCh = 0; 01235 01236 if (!context->nextCh(strCh, offset)) 01237 return false; 01238 01239 if (!isSet(context->fOptions, SINGLE_LINE)) { 01240 01241 if (RegxUtil::isEOLChar(strCh)) 01242 return false; 01243 } 01244 01245 ++offset; 01246 return true; 01247 } 01248 01249 bool RegularExpression::matchRange(Context* const context, const Op* const op, 01250 XMLSize_t& offset, const bool ignoreCase) const 01251 { 01252 if (offset >= context->fLimit) 01253 return false; 01254 01255 XMLInt32 strCh = 0; 01256 01257 if (!context->nextCh(strCh, offset)) 01258 return false; 01259 01260 RangeToken* tok = (RangeToken *) op->getToken(); 01261 bool match = false; 01262 01263 if (ignoreCase) { 01264 tok = tok->getCaseInsensitiveToken(fTokenFactory); 01265 } 01266 01267 match = tok->match(strCh); 01268 01269 if (!match) 01270 return false; 01271 01272 ++offset; 01273 return true; 01274 } 01275 01276 bool RegularExpression::matchAnchor(Context* const context, const XMLInt32 ch, 01277 const XMLSize_t offset) const 01278 { 01279 switch ((XMLCh) ch) { 01280 case chDollarSign: 01281 if (isSet(context->fOptions, MULTIPLE_LINE)) { 01282 if (!(offset == context->fLimit || (offset < context->fLimit 01283 && RegxUtil::isEOLChar(context->fString[offset])))) 01284 return false; 01285 } 01286 else { 01287 01288 if (!(offset == context->fLimit 01289 || (offset+1 == context->fLimit 01290 && RegxUtil::isEOLChar(context->fString[offset])) 01291 || (offset+2 == context->fLimit 01292 && context->fString[offset] == chCR 01293 && context->fString[offset+1] == chLF))) 01294 return false; 01295 } 01296 break; 01297 case chCaret: 01298 if (!isSet(context->fOptions, MULTIPLE_LINE)) { 01299 
01300 if (offset != context->fStart) 01301 return false; 01302 } 01303 else { 01304 01305 if (!(offset == context->fStart || (offset > context->fStart 01306 && RegxUtil::isEOLChar(context->fString[offset-1])))) 01307 return false; 01308 } 01309 break; 01310 } 01311 01312 return true; 01313 } 01314 01315 bool RegularExpression::matchBackReference(Context* const context, 01316 const XMLInt32 refNo, XMLSize_t& offset, 01317 const bool ignoreCase) const 01318 { 01319 if (refNo <=0 || refNo >= fNoGroups) 01320 ThrowXMLwithMemMgr(IllegalArgumentException, XMLExcepts::Regex_BadRefNo, context->fMemoryManager); 01321 01322 // If the group we're matching against wasn't matched, 01323 // the back reference matches the empty string 01324 if (context->fMatch->getStartPos(refNo) < 0 || context->fMatch->getEndPos(refNo) < 0) 01325 return true; 01326 01327 int start = context->fMatch->getStartPos(refNo); 01328 int length = context->fMatch->getEndPos(refNo) - start; 01329 01330 if (int(context->fLimit - offset) < length) 01331 return false; 01332 01333 bool match = ignoreCase ? XMLString::regionIMatches(context->fString,(int)offset, 01334 context->fString,start,length) 01335 : XMLString::regionMatches(context->fString, (int)offset, 01336 context->fString, start,length); 01337 01338 if (match) offset += length; 01339 return match; 01340 } 01341 01342 bool RegularExpression::matchString(Context* const context, 01343 const XMLCh* const literal, XMLSize_t& offset, 01344 const bool ignoreCase) const 01345 { 01346 XMLSize_t length = XMLString::stringLen(literal); 01347 01348 if (context->fLimit - offset < length) 01349 return false; 01350 01351 bool match = ignoreCase ? 
XMLString::regionIMatches(context->fString, (int)offset, 01352 literal, 0, length) 01353 : XMLString::regionMatches(context->fString, (int)offset, 01354 literal, 0, length); 01355 if (match) offset += length; 01356 return match; 01357 } 01358 01359 int RegularExpression::matchCapture(Context* const context, const Op* const op, 01360 XMLSize_t offset) const 01361 { 01362 // No check is made for nullness of fMatch as the function is only called if 01363 // fMatch is not null. 01364 XMLInt32 index = op->getData(); 01365 int save = (index > 0) ? context->fMatch->getStartPos(index) 01366 : context->fMatch->getEndPos(-index); 01367 01368 if (index > 0) { 01369 context->fMatch->setStartPos(index, (int)offset); 01370 int ret = match(context, op->getNextOp(), offset); 01371 if (ret < 0) 01372 context->fMatch->setStartPos(index, save); 01373 return ret; 01374 } 01375 01376 context->fMatch->setEndPos(-index, (int)offset); 01377 int ret = match(context, op->getNextOp(), offset); 01378 if (ret < 0) 01379 context->fMatch->setEndPos(-index, save); 01380 return ret; 01381 } 01382 01383 int RegularExpression::matchUnion(Context* const context, 01384 const Op* const op, XMLSize_t offset) const 01385 { 01386 XMLSize_t opSize = op->getSize(); 01387 01388 Context bestResultContext; 01389 int bestResult=-1; 01390 for(XMLSize_t i=0; i < opSize; i++) { 01391 Context tmpContext(context); 01392 int ret = match(&tmpContext, op->elementAt(i), offset); 01393 if (ret >= 0 && (XMLSize_t)ret <= context->fLimit && ret>bestResult) 01394 { 01395 bestResult=ret; 01396 bestResultContext=tmpContext; 01397 // exit early, if we reached the end of the string 01398 if((XMLSize_t)ret == context->fLimit) 01399 break; 01400 } 01401 } 01402 if(bestResult!=-1) 01403 *context=bestResultContext; 01404 return bestResult; 01405 } 01406 01407 01408 int RegularExpression::parseOptions(const XMLCh* const options) 01409 { 01410 01411 if (options == 0) 01412 return 0; 01413 01414 int opts = 0; 01415 XMLSize_t length = 
XMLString::stringLen(options); 01416 01417 for (XMLSize_t i=0; i < length; i++) { 01418 01419 int v = getOptionValue(options[i]); 01420 01421 if (v == 0) 01422 ThrowXMLwithMemMgr1(ParseException, XMLExcepts::Regex_UnknownOption, options, fMemoryManager); 01423 01424 opts |= v; 01425 } 01426 01427 return opts; 01428 } 01429 01430 void RegularExpression::compile(const Token* const token) { 01431 01432 if (fOperations != 0) 01433 return; 01434 01435 fNoClosures = 0; 01436 fOperations = compile(token, 0, false); 01437 } 01438 01439 Op* RegularExpression::compile(const Token* const token, Op* const next, 01440 const bool reverse) 01441 { 01442 01443 Op* ret = 0; 01444 01445 const Token::tokType tokenType = token->getTokenType(); 01446 01447 switch(tokenType) { 01448 case Token::T_DOT: 01449 ret = fOpFactory.createDotOp(); 01450 ret->setNextOp(next); 01451 break; 01452 case Token::T_CHAR: 01453 ret = fOpFactory.createCharOp(token->getChar()); 01454 ret->setNextOp(next); 01455 break; 01456 case Token::T_ANCHOR: 01457 ret = fOpFactory.createAnchorOp(token->getChar()); 01458 ret->setNextOp(next); 01459 break; 01460 case Token::T_RANGE: 01461 case Token::T_NRANGE: 01462 ret = fOpFactory.createRangeOp(token); 01463 ret->setNextOp(next); 01464 break; 01465 case Token::T_STRING: 01466 ret = fOpFactory.createStringOp(token->getString()); 01467 ret->setNextOp(next); 01468 break; 01469 case Token::T_BACKREFERENCE: 01470 ret = fOpFactory.createBackReferenceOp(token->getReferenceNo()); 01471 ret->setNextOp(next); 01472 break; 01473 case Token::T_EMPTY: 01474 ret = next; 01475 break; 01476 case Token::T_CONCAT: 01477 ret = compileConcat(token, next, reverse); 01478 break; 01479 case Token::T_UNION: 01480 ret = compileUnion(token, next, reverse); 01481 break; 01482 case Token::T_CLOSURE: 01483 case Token::T_NONGREEDYCLOSURE: 01484 ret = compileClosure(token, next, reverse, tokenType); 01485 break; 01486 case Token::T_PAREN: 01487 ret = compileParenthesis(token, next, reverse); 01488 
break; 01489 default: 01490 ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Regex_UnknownTokenType, fMemoryManager); 01491 break; // this line to be deleted 01492 } 01493 01494 return ret; 01495 } 01496 01497 /* 01498 * Prepares for matching. This method is called during construction. 01499 */ 01500 void RegularExpression::prepare() { 01501 01502 compile(fTokenTree); 01503 01504 fMinLength = fTokenTree->getMinLength(); 01505 fFirstChar = 0; 01506 01507 if (!isSet(fOptions, PROHIBIT_HEAD_CHARACTER_OPTIMIZATION) && 01508 !isSet(fOptions, XMLSCHEMA_MODE)) { 01509 01510 RangeToken* rangeTok = fTokenFactory->createRange(); 01511 Token::firstCharacterOptions result = fTokenTree->analyzeFirstCharacter(rangeTok, fOptions, fTokenFactory); 01512 01513 if (result == Token::FC_TERMINAL) { 01514 01515 rangeTok->compactRanges(); 01516 fFirstChar = rangeTok; 01517 } 01518 01519 rangeTok->createMap(); 01520 01521 if (isSet(fOptions, IGNORE_CASE)) 01522 { 01523 rangeTok->getCaseInsensitiveToken(fTokenFactory); 01524 } 01525 } 01526 01527 if (fOperations != 0 && fOperations->getNextOp() == 0 && 01528 (fOperations->getOpType() == Op::O_STRING || 01529 fOperations->getOpType() == Op::O_CHAR) && 01530 !isSet(fOptions, IGNORE_CASE) ) { 01531 01532 fFixedStringOnly = true; 01533 01534 if (fOperations->getOpType() == Op::O_STRING) { 01535 fMemoryManager->deallocate(fFixedString);//delete [] fFixedString; 01536 fFixedString = XMLString::replicate(fOperations->getLiteral(), fMemoryManager); 01537 } 01538 else{ 01539 01540 XMLInt32 ch = fOperations->getData(); 01541 01542 if ( ch >= 0x10000) { // add as constant 01543 fMemoryManager->deallocate(fFixedString);//delete [] fFixedString; 01544 fFixedString = RegxUtil::decomposeToSurrogates(ch, fMemoryManager); 01545 } 01546 else { 01547 01548 XMLCh* dummyStr = (XMLCh*) fMemoryManager->allocate(2 * sizeof(XMLCh));//new XMLCh[2]; 01549 dummyStr[0] = (XMLCh) fOperations->getData(); 01550 dummyStr[1] = chNull; 01551 
fMemoryManager->deallocate(fFixedString);//delete [] fFixedString; 01552 fFixedString = dummyStr; 01553 } 01554 } 01555 01556 fBMPattern = new (fMemoryManager) BMPattern(fFixedString, 256, 01557 isSet(fOptions, IGNORE_CASE), fMemoryManager); 01558 } 01559 else if (!isSet(fOptions, XMLSCHEMA_MODE) && 01560 !isSet(fOptions, PROHIBIT_FIXED_STRING_OPTIMIZATION) && 01561 !isSet(fOptions, IGNORE_CASE)) { 01562 01563 int fixedOpts = 0; 01564 Token* tok = fTokenTree->findFixedString(fOptions, fixedOpts); 01565 01566 fMemoryManager->deallocate(fFixedString);//delete [] fFixedString; 01567 01568 fFixedString = (tok == 0) ? 0 01569 : XMLString::replicate(tok->getString(), fMemoryManager); 01570 01571 if (fFixedString != 0 && XMLString::stringLen(fFixedString) < 2) { 01572 01573 fMemoryManager->deallocate(fFixedString);//delete [] fFixedString; 01574 fFixedString = 0; 01575 } 01576 01577 if (fFixedString != 0) { 01578 01579 fBMPattern = new (fMemoryManager) BMPattern(fFixedString, 256, 01580 isSet(fixedOpts, IGNORE_CASE), fMemoryManager); 01581 } 01582 } 01583 } 01584 01585 bool RegularExpression::doTokenOverlap(const Op* op, Token* token) 01586 { 01587 if(op->getOpType()==Op::O_RANGE) 01588 { 01589 RangeToken* t1=(RangeToken*)op->getToken(); 01590 switch(token->getTokenType()) 01591 { 01592 case Token::T_CHAR: 01593 return t1->match(token->getChar()); 01594 case Token::T_STRING: 01595 return t1->match(*token->getString()); 01596 case Token::T_RANGE: 01597 { 01598 try 01599 { 01600 RangeToken tempRange(Token::T_RANGE, fMemoryManager); 01601 tempRange.mergeRanges(t1); 01602 tempRange.intersectRanges((RangeToken*)token); 01603 return !tempRange.empty(); 01604 } 01605 catch(RuntimeException&) 01606 { 01607 } 01608 break; 01609 } 01610 default: 01611 break; 01612 } 01613 return true; 01614 } 01615 01616 XMLInt32 ch=0; 01617 if(op->getOpType()==Op::O_CHAR) 01618 ch=op->getData(); 01619 else if(op->getOpType()==Op::O_STRING) 01620 ch=*op->getLiteral(); 01621 01622 if(ch!=0) 01623 { 
01624 switch(token->getTokenType()) 01625 { 01626 case Token::T_CHAR: 01627 return token->getChar()==ch; 01628 case Token::T_STRING: 01629 return *token->getString()==ch; 01630 case Token::T_RANGE: 01631 case Token::T_NRANGE: 01632 return ((RangeToken*)token)->match(ch); 01633 default: 01634 break; 01635 } 01636 } 01637 // in any other case, there is the chance that they overlap 01638 return true; 01639 } 01640 01641 XERCES_CPP_NAMESPACE_END 01642