GME  13
RegularExpression.cpp
Go to the documentation of this file.
00001 /*
00002  * Licensed to the Apache Software Foundation (ASF) under one or more
00003  * contributor license agreements.  See the NOTICE file distributed with
00004  * this work for additional information regarding copyright ownership.
00005  * The ASF licenses this file to You under the Apache License, Version 2.0
00006  * (the "License"); you may not use this file except in compliance with
00007  * the License.  You may obtain a copy of the License at
00008  *
00009  *      http://www.apache.org/licenses/LICENSE-2.0
00010  *
00011  * Unless required by applicable law or agreed to in writing, software
00012  * distributed under the License is distributed on an "AS IS" BASIS,
00013  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00014  * See the License for the specific language governing permissions and
00015  * limitations under the License.
00016  */
00017 
00018 /*
00019  * $Id: RegularExpression.cpp 822158 2009-10-06 07:52:59Z amassari $
00020  */
00021 
00022 // ---------------------------------------------------------------------------
00023 //  Includes
00024 // ---------------------------------------------------------------------------
00025 #include <xercesc/util/regx/RegularExpression.hpp>
00026 #include <xercesc/util/PlatformUtils.hpp>
00027 #include <xercesc/util/regx/Match.hpp>
00028 #include <xercesc/util/regx/RangeToken.hpp>
00029 #include <xercesc/util/regx/RegxDefs.hpp>
00030 #include <xercesc/util/regx/XMLUniCharacter.hpp>
00031 #include <xercesc/util/regx/ParserForXMLSchema.hpp>
00032 #include <xercesc/util/Janitor.hpp>
00033 #include <xercesc/util/ParseException.hpp>
00034 #include <xercesc/util/IllegalArgumentException.hpp>
00035 #include <xercesc/framework/XMLBuffer.hpp>
00036 #include <xercesc/util/OutOfMemoryException.hpp>
00037 #include <xercesc/util/XMLInitializer.hpp>
00038 #include <xercesc/util/XMLUniDefs.hpp>
00039 #include <xercesc/util/ValueStackOf.hpp>
00040 
00041 XERCES_CPP_NAMESPACE_BEGIN
00042 
00043 // ---------------------------------------------------------------------------
00044 //  Static member data initialization
00045 // ---------------------------------------------------------------------------
00046 const unsigned int RegularExpression::IGNORE_CASE = 2;
00047 const unsigned int RegularExpression::SINGLE_LINE = 4;
00048 const unsigned int RegularExpression::MULTIPLE_LINE = 8;
00049 const unsigned int RegularExpression::EXTENDED_COMMENT = 16;
00050 const unsigned int RegularExpression::PROHIBIT_HEAD_CHARACTER_OPTIMIZATION = 128;
00051 const unsigned int RegularExpression::PROHIBIT_FIXED_STRING_OPTIMIZATION = 256;
00052 const unsigned int RegularExpression::XMLSCHEMA_MODE = 512;
00053 RangeToken*        RegularExpression::fWordRange = 0;
00054 
00055 bool RegularExpression::matchIgnoreCase(const XMLInt32 ch1,
00056                                         const XMLInt32 ch2) const
00057 {
00058     if (ch1 >= 0x10000)
00059     {
00060         XMLCh string1[2];
00061         XMLCh string2[2];
00062 
00063         RegxUtil::decomposeToSurrogates(ch1, string1[0], string1[1]);
00064 
00065         if (ch2 >= 0x10000)
00066         {
00067             RegxUtil::decomposeToSurrogates(ch2, string2[0], string2[1]);
00068         }
00069         else
00070         {
00071             // XMLString::compareNIString is broken, because it assume the
00072             // two strings must be of the same length.  Note that two strings
00073             // of different length could compare as equal, because there is no
00074             // guarantee that a Unicode code point that is encoded in UTF-16 as
00075             // a surrogate pair does not have a case mapping to a code point
00076             // that is not in the surrogate range.  Just to be safe, we pad the
00077             // shorter string with a space, which cannot hvae a case mapping.
00078             string2[0] = (XMLCh)ch2;
00079             string2[1] = chSpace;
00080         }
00081 
00082         return (0==XMLString::compareNIString(string1, string2, 2));
00083     }
00084     else if (ch2 >= 0x10000)
00085     {
00086         const XMLCh string1[2] = { (XMLCh)ch1, chSpace };
00087         XMLCh string2[2];
00088 
00089         RegxUtil::decomposeToSurrogates(ch2, string2[0], string2[1]);
00090 
00091         return (0==XMLString::compareNIString(string1, string2, 2));
00092     }
00093     else
00094     {
00095         const XMLCh  char1 = (XMLCh)ch1;
00096         const XMLCh  char2 = (XMLCh)ch2;
00097 
00098         return (0==XMLString::compareNIString(&char1, &char2, 1));
00099     }
00100 }
00101 
00102 
00103 
00104 // ---------------------------------------------------------------------------
00105 //  RegularExpression::Context: Constructors and Destructor
00106 // ---------------------------------------------------------------------------
00107 RegularExpression::Context::Context(MemoryManager* const manager) :
00108     fAdoptMatch(false)
00109     , fStart(0)
00110     , fLimit(0)
00111     , fLength(0)
00112     , fSize(0)
00113     , fStringMaxLen(0)
00114     , fOffsets(0)
00115     , fMatch(0)
00116     , fString(0)
00117     , fOptions(0)
00118     , fMemoryManager(manager)
00119 {
00120 }
00121 
00122 RegularExpression::Context::Context(Context* src) :
00123     fAdoptMatch(false)
00124     , fStart(src->fStart)
00125     , fLimit(src->fLimit)
00126     , fLength(src->fLength)
00127     , fSize(src->fSize)
00128     , fStringMaxLen(src->fStringMaxLen)
00129     , fOffsets(0)
00130     , fMatch(0)
00131     , fString(src->fString)
00132     , fOptions(src->fOptions)
00133     , fMemoryManager(src->fMemoryManager)
00134 {
00135     if(src->fOffsets)
00136     {
00137         fOffsets = (int*) fMemoryManager->allocate(fSize* sizeof(int));
00138         for (int i = 0; i< fSize; i++)
00139             fOffsets[i] = src->fOffsets[i];
00140     }
00141     if(src->fMatch)
00142     {
00143         fMatch=new (fMemoryManager) Match(*src->fMatch);
00144         fAdoptMatch=true;
00145     }
00146 }
00147 
00148 RegularExpression::Context& RegularExpression::Context::operator=(const RegularExpression::Context& other)
00149 {
00150     if (this != &other)
00151     {
00152         fStart=other.fStart;
00153         fLimit=other.fLimit;
00154         fLength=other.fLength;
00155         fStringMaxLen=other.fStringMaxLen;
00156         fString=other.fString;
00157         fOptions=other.fOptions;
00158 
00159         // if offset and match are already allocated with the right size, reuse them 
00160         // (fMatch can be provided by the user to get the data back)
00161         if(fMatch && other.fMatch && fMatch->getNoGroups()==other.fMatch->getNoGroups())
00162             *fMatch=*other.fMatch;
00163         else
00164         {
00165             if (fAdoptMatch)
00166                 delete fMatch;
00167             fMatch=0;
00168             if(other.fMatch)
00169             {
00170                 fMatch=new (other.fMemoryManager) Match(*other.fMatch);
00171                 fAdoptMatch=true;
00172             }
00173         }
00174 
00175         if (fOffsets && other.fOffsets && fSize==other.fSize)
00176         {
00177             for (int i = 0; i< fSize; i++)
00178                 fOffsets[i] = other.fOffsets[i];
00179         }
00180         else
00181         {
00182             if(fOffsets)
00183                 fMemoryManager->deallocate(fOffsets);//delete [] fOffsets;
00184             fOffsets=0;
00185             fSize=other.fSize;
00186             if(other.fOffsets)
00187             {
00188                 fOffsets = (int*) other.fMemoryManager->allocate(fSize* sizeof(int));
00189                 for (int i = 0; i< fSize; i++)
00190                     fOffsets[i] = other.fOffsets[i];
00191             }
00192         }
00193 
00194         fMemoryManager=other.fMemoryManager;
00195     }
00196 
00197     return *this;
00198 }
00199 
00200 RegularExpression::Context::~Context()
00201 {
00202     if (fOffsets)
00203         fMemoryManager->deallocate(fOffsets);//delete [] fOffsets;
00204 
00205     if (fAdoptMatch)
00206         delete fMatch;
00207 }
00208 
00209 // ---------------------------------------------------------------------------
00210 //  RegularExpression::Context: Public methods
00211 // ---------------------------------------------------------------------------
00212 void RegularExpression::Context::reset(const XMLCh* const string
00213                                        , const XMLSize_t stringLen
00214                                        , const XMLSize_t start
00215                                        , const XMLSize_t limit
00216                                        , const int noClosures
00217                                        , const unsigned int options)
00218 {
00219     fString = string;
00220     fStringMaxLen = stringLen;
00221     fStart = start;
00222     fLimit = limit;
00223     fLength = fLimit - fStart;
00224     if (fAdoptMatch)
00225         delete fMatch;
00226     fMatch = 0;
00227 
00228     if (fSize != noClosures) {
00229         if (fOffsets)
00230             fMemoryManager->deallocate(fOffsets);//delete [] fOffsets;
00231         fOffsets = (int*) fMemoryManager->allocate(noClosures * sizeof(int));//new int[noClosures];
00232     }
00233 
00234     fSize = noClosures;
00235     fOptions = options;
00236 
00237     for (int i = 0; i< fSize; i++)
00238         fOffsets[i] = -1;
00239 }
00240 
00241 bool RegularExpression::Context::nextCh(XMLInt32& ch, XMLSize_t& offset)
00242 {
00243     ch = fString[offset];
00244 
00245     if (RegxUtil::isHighSurrogate(ch)) {
00246         if ((offset + 1 < fLimit) && RegxUtil::isLowSurrogate(fString[offset+1])) {
00247             ch = RegxUtil::composeFromSurrogate(ch, fString[++offset]);
00248         }
00249         else return false;
00250     }
00251     else if (RegxUtil::isLowSurrogate(ch)) {
00252         return false;
00253     }
00254 
00255     return true;
00256 }
00257 
00258 // ---------------------------------------------------------------------------
00259 //  RegularExpression: Constructors and Destructors
00260 // ---------------------------------------------------------------------------
00261 
00262 typedef JanitorMemFunCall<RegularExpression>    CleanupType;
00263 
00264 RegularExpression::RegularExpression(const char* const pattern,
00265                                      MemoryManager* const manager)
00266     :fHasBackReferences(false),
00267      fFixedStringOnly(false),
00268      fNoGroups(0),
00269      fMinLength(0),
00270      fNoClosures(0),
00271      fOptions(0),
00272      fBMPattern(0),
00273      fPattern(0),
00274      fFixedString(0),
00275      fOperations(0),
00276      fTokenTree(0),
00277      fFirstChar(0),
00278      fOpFactory(manager),
00279      fTokenFactory(0),
00280      fMemoryManager(manager)
00281 {
00282     CleanupType cleanup(this, &RegularExpression::cleanUp);
00283 
00284     try {
00285 
00286         XMLCh* tmpBuf = XMLString::transcode(pattern, fMemoryManager);
00287         ArrayJanitor<XMLCh> janBuf(tmpBuf, fMemoryManager);
00288         setPattern(tmpBuf);
00289     }
00290     catch(const OutOfMemoryException&)
00291     {
00292         cleanup.release();
00293 
00294         throw;
00295     }
00296 
00297     cleanup.release();
00298 }
00299 
00300 RegularExpression::RegularExpression(const char* const pattern,
00301                                      const char* const options,
00302                                      MemoryManager* const manager)
00303     :fHasBackReferences(false),
00304      fFixedStringOnly(false),
00305      fNoGroups(0),
00306      fMinLength(0),
00307      fNoClosures(0),
00308      fOptions(0),
00309      fBMPattern(0),
00310      fPattern(0),
00311      fFixedString(0),
00312      fOperations(0),
00313      fTokenTree(0),
00314      fFirstChar(0),
00315      fOpFactory(manager),
00316      fTokenFactory(0),
00317      fMemoryManager(manager)
00318 {
00319     CleanupType cleanup(this, &RegularExpression::cleanUp);
00320 
00321     try {
00322 
00323         XMLCh* tmpBuf = XMLString::transcode(pattern, fMemoryManager);
00324         ArrayJanitor<XMLCh> janBuf(tmpBuf, fMemoryManager);
00325         XMLCh* tmpOptions = XMLString::transcode(options, fMemoryManager);
00326         ArrayJanitor<XMLCh> janOps(tmpOptions, fMemoryManager);
00327         setPattern(tmpBuf, tmpOptions);
00328     }
00329     catch(const OutOfMemoryException&)
00330     {
00331         cleanup.release();
00332 
00333         throw;
00334     }
00335 
00336     cleanup.release();
00337 }
00338 
00339 
00340 RegularExpression::RegularExpression(const XMLCh* const pattern,
00341                                      MemoryManager* const manager)
00342     :fHasBackReferences(false),
00343      fFixedStringOnly(false),
00344      fNoGroups(0),
00345      fMinLength(0),
00346      fNoClosures(0),
00347      fOptions(0),
00348      fBMPattern(0),
00349      fPattern(0),
00350      fFixedString(0),
00351      fOperations(0),
00352      fTokenTree(0),
00353      fFirstChar(0),
00354      fOpFactory(manager),
00355      fTokenFactory(0),
00356      fMemoryManager(manager)
00357 {
00358     CleanupType cleanup(this, &RegularExpression::cleanUp);
00359 
00360     try {
00361 
00362         setPattern(pattern);
00363     }
00364     catch(const OutOfMemoryException&)
00365     {
00366         cleanup.release();
00367 
00368         throw;
00369     }
00370 
00371     cleanup.release();
00372 }
00373 
00374 RegularExpression::RegularExpression(const XMLCh* const pattern,
00375                                      const XMLCh* const options,
00376                                      MemoryManager* const manager)
00377     :fHasBackReferences(false),
00378      fFixedStringOnly(false),
00379      fNoGroups(0),
00380      fMinLength(0),
00381      fNoClosures(0),
00382      fOptions(0),
00383      fBMPattern(0),
00384      fPattern(0),
00385      fFixedString(0),
00386      fOperations(0),
00387      fTokenTree(0),
00388      fFirstChar(0),
00389      fOpFactory(manager),
00390      fTokenFactory(0),
00391      fMemoryManager(manager)
00392 {
00393     CleanupType cleanup(this, &RegularExpression::cleanUp);
00394 
00395     try {
00396 
00397         setPattern(pattern, options);
00398     }
00399     catch(const OutOfMemoryException&)
00400     {
00401         cleanup.release();
00402 
00403         throw;
00404     }
00405 
00406     cleanup.release();
00407 }
00408 
00409 RegularExpression::~RegularExpression() {
00410 
00411     cleanUp();
00412 }
00413 
00414 // ---------------------------------------------------------------------------
00415 //  RegularExpression: Setter methods
00416 // ---------------------------------------------------------------------------
00417 
00418 RegxParser* RegularExpression::getRegexParser(const int options, MemoryManager* const manager)
00419 {
00420     // the following construct causes an error in an Intel 7.1 32 bit compiler for
00421     // red hat linux 7.2
00422     // (when an exception is thrown the wrong object is deleted)
00423     //RegxParser* regxParser = isSet(fOptions, XMLSCHEMA_MODE)
00424     //    ? new (fMemoryManager) ParserForXMLSchema(fMemoryManager)
00425     //    : new (fMemoryManager) RegxParser(fMemoryManager);
00426     if (isSet(options, XMLSCHEMA_MODE))
00427         return new (manager) ParserForXMLSchema(manager);
00428 
00429     return new (manager) RegxParser(manager);
00430 }
00431 
00432 void RegularExpression::setPattern(const XMLCh* const pattern,
00433                                    const XMLCh* const options)
00434 {
00435 
00436     fTokenFactory = new (fMemoryManager) TokenFactory(fMemoryManager);
00437     fOptions = parseOptions(options);
00438     fPattern = XMLString::replicate(pattern, fMemoryManager);
00439 
00440     RegxParser* regxParser=getRegexParser(fOptions, fMemoryManager);
00441 
00442     if (regxParser)
00443         regxParser->setTokenFactory(fTokenFactory);
00444 
00445     Janitor<RegxParser> janRegxParser(regxParser);
00446     fTokenTree = regxParser->parse(fPattern, fOptions);
00447     fNoGroups = regxParser->getNoParen();
00448     fHasBackReferences = regxParser->hasBackReferences();
00449 
00450     prepare();
00451 }
00452 
00453 // ---------------------------------------------------------------------------
00454 //  RegularExpression: Matching methods
00455 // ---------------------------------------------------------------------------
00456 bool RegularExpression::matches(const char* const expression
00457                                 , MemoryManager* const manager) const
00458 {
00459     XMLCh* tmpBuf = XMLString::transcode(expression, manager);
00460     ArrayJanitor<XMLCh> janBuf(tmpBuf, manager);
00461     return matches(tmpBuf, 0, XMLString::stringLen(tmpBuf), 0, manager);
00462 }
00463 
00464 bool RegularExpression::matches(const char* const expression
00465                                 , const XMLSize_t start, const XMLSize_t end
00466                                 , MemoryManager* const manager) const
00467 {
00468 
00469     XMLCh* tmpBuf = XMLString::transcode(expression, manager);
00470     ArrayJanitor<XMLCh> janBuf(tmpBuf, manager);
00471     return matches(tmpBuf, start, end, 0, manager);
00472 }
00473 
00474 bool RegularExpression::matches(const char* const expression
00475                                 , Match* const match
00476                                 , MemoryManager* const manager) const
00477 {
00478 
00479     XMLCh* tmpBuf = XMLString::transcode(expression, manager);
00480     ArrayJanitor<XMLCh> janBuf(tmpBuf, manager);
00481     return matches(tmpBuf, 0, XMLString::stringLen(tmpBuf), match, manager);
00482 }
00483 
00484 bool RegularExpression::matches(const char* const expression, const XMLSize_t start
00485                                 , const XMLSize_t end, Match* const pMatch
00486                                 , MemoryManager* const manager) const
00487 {
00488 
00489     XMLCh* tmpBuf = XMLString::transcode(expression, manager);
00490     ArrayJanitor<XMLCh> janBuf(tmpBuf, manager);
00491     return matches(tmpBuf, start, end, pMatch, manager);
00492 }
00493 
00494 
00495 // ---------------------------------------------------------------------------
00496 //  RegularExpression: Matching methods - Wide char version
00497 // ---------------------------------------------------------------------------
00498 bool RegularExpression::matches(const XMLCh* const expression, MemoryManager* const manager) const
00499 {
00500     return matches(expression, 0, XMLString::stringLen(expression), 0, manager);
00501 }
00502 
00503 bool RegularExpression::matches(const XMLCh* const expression
00504                                 , const XMLSize_t start, const XMLSize_t end
00505                                 , MemoryManager* const manager) const
00506 {
00507     return matches(expression, start, end, 0, manager);
00508 }
00509 
00510 bool RegularExpression::matches(const XMLCh* const expression
00511                                 , Match* const match
00512                                 , MemoryManager* const manager) const
00513 {
00514     return matches(expression, 0, XMLString::stringLen(expression), match, manager);
00515 }
00516 
00517 bool RegularExpression::matches(const XMLCh* const expression, const XMLSize_t start
00518                                 , const XMLSize_t end, Match* const pMatch
00519                                 , MemoryManager* const manager) const
00520 {
00521 
00522     Context context(manager);
00523     XMLSize_t strLength = XMLString::stringLen(expression);
00524 
00525     context.reset(expression, strLength, start, end, fNoClosures, fOptions);
00526 
00527     bool adoptMatch = false;
00528     Match* lMatch = pMatch;
00529 
00530     if (lMatch != 0) {
00531         lMatch->setNoGroups(fNoGroups);
00532     }
00533     else if (fHasBackReferences) {
00534         lMatch = new (manager) Match(manager);
00535         lMatch->setNoGroups(fNoGroups);
00536         adoptMatch = true;
00537     }
00538 
00539     if (context.fAdoptMatch)
00540         delete context.fMatch;
00541     context.fMatch = lMatch;
00542     context.fAdoptMatch = adoptMatch;
00543 
00544     if (isSet(fOptions, XMLSCHEMA_MODE)) {
00545 
00546         int matchEnd = match(&context, fOperations, context.fStart);
00547 
00548         if (matchEnd == (int)context.fLimit) {
00549 
00550             if (context.fMatch != 0) {
00551 
00552                 context.fMatch->setStartPos(0, (int)context.fStart);
00553                 context.fMatch->setEndPos(0, matchEnd);
00554             }
00555             return true;
00556         }
00557 
00558         return false;
00559     }
00560 
00561     /*
00562      * If the pattern has only fixed string, use Boyer-Moore
00563      */
00564     if (fFixedStringOnly) {
00565 
00566         int ret = fBMPattern->matches(expression, context.fStart, context.fLimit);
00567         if (ret >= 0) {
00568 
00569             if (context.fMatch != 0) {
00570                 context.fMatch->setStartPos(0, ret);
00571                 context.fMatch->setEndPos(0, (int)(ret + XMLString::stringLen(fPattern)));
00572             }
00573             return true;
00574         }
00575         return false;
00576     }
00577 
00578     /*
00579      * If the pattern contains a fixed string, we check with Boyer-Moore
00580      * whether the text contains the fixed string or not. If not found
00581      * return false
00582      */
00583     if (fFixedString != 0) {
00584 
00585         int ret = fBMPattern->matches(expression, context.fStart, context.fLimit);
00586 
00587         if (ret < 0) { // No match
00588             return false;
00589         }
00590     }
00591 
00592     // if the length is less than the minimum length, we cannot possibly match
00593     if(context.fLimit<fMinLength)
00594         return false;
00595 
00596     XMLSize_t limit = context.fLimit - fMinLength;
00597     XMLSize_t matchStart;
00598     int matchEnd = -1;
00599 
00600     /*
00601      * Check whether the expression start with ".*"
00602      */
00603     if (fOperations != 0 && (fOperations->getOpType() == Op::O_CLOSURE || fOperations->getOpType() == Op::O_FINITE_CLOSURE)
00604         && fOperations->getChild()->getOpType() == Op::O_DOT) {
00605 
00606         if (isSet(fOptions, SINGLE_LINE)) {
00607             matchStart = context.fStart;
00608             matchEnd = match(&context, fOperations, matchStart);
00609         }
00610         else {
00611             bool previousIsEOL = true;
00612 
00613             for (matchStart=context.fStart; matchStart<=limit; matchStart++) {
00614 
00615                 XMLCh ch = expression[matchStart];
00616                 if (RegxUtil::isEOLChar(ch)) {
00617                     previousIsEOL = true;
00618                 }
00619                 else {
00620 
00621                     if (previousIsEOL) {
00622                         if (0 <= (matchEnd = match(&context, fOperations,
00623                                                    matchStart)))
00624                             break;
00625                     }
00626 
00627                     previousIsEOL = false;
00628                 }
00629             }
00630         }
00631     }
00632     else {
00633         /*
00634          *    Optimization against the first char
00635          */
00636         if (fFirstChar != 0) {
00637             bool ignoreCase = isSet(fOptions, IGNORE_CASE);
00638             RangeToken* range = fFirstChar;
00639 
00640             if (ignoreCase)
00641                 range = fFirstChar->getCaseInsensitiveToken(fTokenFactory);
00642 
00643             for (matchStart=context.fStart; matchStart<=limit; matchStart++) {
00644 
00645                 XMLInt32 ch;
00646 
00647                 if (!context.nextCh(ch, matchStart))
00648                     break;
00649 
00650                 if (!range->match(ch))
00651                     continue;
00652 
00653                 if (0 <= (matchEnd = match(&context,fOperations,matchStart)))
00654                     break;
00655             }
00656         }
00657         else {
00658 
00659             /*
00660              *    Straightforward matching
00661              */
00662             for (matchStart=context.fStart; matchStart<=limit; matchStart++) {
00663 
00664                 if (0 <= (matchEnd = match(&context,fOperations,matchStart)))
00665                     break;
00666             }
00667         }
00668     }
00669 
00670     if (matchEnd >= 0) {
00671 
00672         if (context.fMatch != 0) {
00673 
00674             context.fMatch->setStartPos(0, (int)matchStart);
00675             context.fMatch->setEndPos(0, matchEnd);
00676         }
00677         return true;
00678     }
00679     return false;
00680 }
00681 
00682 // ---------------------------------------------------------------------------
00683 //  RegularExpression: Tokenize methods
00684 // ---------------------------------------------------------------------------
00685 RefArrayVectorOf<XMLCh>* RegularExpression::tokenize(const char* const expression,
00686                                                      MemoryManager* const manager) const
00687 {
00688 
00689   XMLCh* tmpBuf = XMLString::transcode(expression, manager);
00690   ArrayJanitor<XMLCh> janBuf(tmpBuf, manager);
00691   return tokenize(tmpBuf, 0, XMLString::stringLen(tmpBuf), manager);
00692 }
00693 
00694 RefArrayVectorOf<XMLCh>* RegularExpression::tokenize(const char* const expression,
00695                                                      const XMLSize_t start, const XMLSize_t end,
00696                                                      MemoryManager* const manager) const
00697 {
00698 
00699   XMLCh* tmpBuf = XMLString::transcode(expression, manager);
00700   ArrayJanitor<XMLCh> janBuf(tmpBuf, manager);
00701   return tokenize(tmpBuf, start, end, manager);
00702 }
00703 
00704 
00705 
00706 // ---------------------------------------------------------------------------
00707 //  RegularExpression: Tokenize methods - Wide char version
00708 // ---------------------------------------------------------------------------
00709 RefArrayVectorOf<XMLCh>* RegularExpression::tokenize(const XMLCh* const expression,
00710                                                      MemoryManager* const manager) const
00711 {
00712     return tokenize(expression, 0, XMLString::stringLen(expression), manager);
00713 }
00714 
00715 RefArrayVectorOf<XMLCh>* RegularExpression::tokenize(const XMLCh* const matchString,
00716                                                      const XMLSize_t start, const XMLSize_t end,
00717                                                      MemoryManager* const manager) const
00718 {
00719     // check if matches zero length string - throw error if so
00720     if(matches(XMLUni::fgZeroLenString, manager)){
00721         ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Regex_RepPatMatchesZeroString, manager);
00722     }
00723 
00724     RefVectorOf<Match> *subEx = new (manager) RefVectorOf<Match>(10, true, manager);
00725     Janitor<RefVectorOf<Match> > janSubEx(subEx);
00726 
00727     allMatches(matchString, start, end, subEx, manager);
00728 
00729     RefArrayVectorOf<XMLCh> *tokens = new (manager) RefArrayVectorOf<XMLCh>(16, true, manager);
00730     XMLSize_t tokStart = start;
00731 
00732     XMLSize_t i = 0;
00733     for(; i < subEx->size(); ++i) {
00734         Match *match = subEx->elementAt(i);
00735         XMLSize_t matchStart = match->getStartPos(0);
00736 
00737         XMLCh *token = (XMLCh*)manager->allocate((matchStart + 1 - tokStart) * sizeof(XMLCh));
00738         XMLString::subString(token, matchString, tokStart, matchStart, manager);
00739         tokens->addElement(token);
00740 
00741         tokStart = match->getEndPos(0);
00742     }
00743 
00744     XMLCh *token = (XMLCh*)manager->allocate((end + 1 - tokStart) * sizeof(XMLCh));
00745     XMLString::subString(token, matchString, tokStart, end, manager);
00746     tokens->addElement(token);
00747 
00748     return tokens;
00749 }
00750 
00751 void RegularExpression::allMatches(const XMLCh* const matchString, const XMLSize_t start, const XMLSize_t end,
00752                                    RefVectorOf<Match> *subEx, MemoryManager* const manager) const
00753 {
00754     Context context(manager);
00755     context.reset(matchString, XMLString::stringLen(matchString), start, end, fNoClosures, fOptions);
00756 
00757     context.fMatch = new (manager) Match(manager);
00758     context.fMatch->setNoGroups(fNoGroups);
00759     context.fAdoptMatch = true;
00760 
00761     XMLSize_t matchStart = start;
00762     while(matchStart <= end) {
00763         XMLSize_t matchEnd = match(&context, fOperations, matchStart);
00764         if(matchEnd != (XMLSize_t)-1) {
00765             context.fMatch->setStartPos(0, (int)matchStart);
00766             context.fMatch->setEndPos(0, (int)matchEnd);
00767 
00768             subEx->addElement(context.fMatch);
00769 
00770             context.fMatch = new (manager) Match(*(context.fMatch));
00771             context.fAdoptMatch = true;
00772 
00773             matchStart = matchEnd;
00774         } else {
00775             ++matchStart;
00776         }
00777     }
00778 }
00779 
00780 
00781 // -----------------------------------------------------------------------
00782 //  RegularExpression: Replace methods
00783 // -----------------------------------------------------------------------
00784 XMLCh* RegularExpression::replace(const char* const matchString,
00785                                   const char* const replaceString,
00786                                   MemoryManager* const manager) const
00787 {
00788 
00789     XMLCh* tmpBuf = XMLString::transcode(matchString, manager);
00790     ArrayJanitor<XMLCh> janBuf(tmpBuf, manager);
00791     XMLCh* tmpBuf2 = XMLString::transcode(replaceString, manager);
00792     ArrayJanitor<XMLCh> janBuf2(tmpBuf2, manager);
00793 
00794     return replace(tmpBuf, tmpBuf2, 0, XMLString::stringLen(tmpBuf), manager);
00795 }
00796 
00797 XMLCh* RegularExpression::replace(const char* const matchString,
00798                                   const char* const replaceString,
00799                                   const XMLSize_t start, const XMLSize_t end,
00800                                   MemoryManager* const manager) const
00801 {
00802 
00803     XMLCh* tmpBuf = XMLString::transcode(matchString, manager);
00804     ArrayJanitor<XMLCh> janBuf(tmpBuf, manager);
00805     XMLCh* tmpBuf2 = XMLString::transcode(replaceString, manager);
00806     ArrayJanitor<XMLCh> janBuf2(tmpBuf2, manager);
00807 
00808     return replace(tmpBuf, tmpBuf2, start, end, manager);
00809 }
00810 
00811 
00812 // ---------------------------------------------------------------------------
00813 //  RegularExpression: Replace methods - Wide char version
00814 // ---------------------------------------------------------------------------
00815 XMLCh* RegularExpression::replace(const XMLCh* const matchString,
00816                                   const XMLCh* const replaceString,
00817                                   MemoryManager* const manager) const
00818 {
00819 
00820     return replace(matchString, replaceString, 0,
00821                    XMLString::stringLen(matchString), manager);
00822 }
00823 
00824 XMLCh* RegularExpression::replace(const XMLCh* const matchString,
00825                                   const XMLCh* const replaceString,
00826                                   const XMLSize_t start, const XMLSize_t end,
00827                                   MemoryManager* const manager) const
00828 {
00829     // check if matches zero length string - throw error if so
00830     if(matches(XMLUni::fgZeroLenString, manager)){
00831         ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Regex_RepPatMatchesZeroString, manager);
00832     }
00833 
00834     RefVectorOf<Match> *subEx = new (manager) RefVectorOf<Match>(10, true, manager);
00835     Janitor<RefVectorOf<Match> > janSubEx(subEx);
00836 
00837     allMatches(matchString, start, end, subEx, manager);
00838 
00839     XMLBuffer result(1023, manager);
00840     int tokStart = (int)start;
00841 
00842     XMLSize_t i = 0;
00843     for(; i < subEx->size(); ++i) {
00844         Match *match = subEx->elementAt(i);
00845         int matchStart = match->getStartPos(0);
00846 
00847         if(matchStart > tokStart)
00848             result.append(matchString + tokStart, matchStart - tokStart);
00849         subInExp(replaceString, matchString, match, result, manager);
00850 
00851         tokStart = match->getEndPos(0);
00852     }
00853 
00854     if(end > (XMLSize_t)tokStart)
00855         result.append(matchString + tokStart, end - tokStart);
00856 
00857     return XMLString::replicate(result.getRawBuffer(), manager);
00858 }
00859 
00860 /*
00861  * Helper for Replace. This method prepares the replacement string by substituting
00862  * in actual values for parenthesized sub expressions.
00863  *
00864  * An error will be thrown if:
00865  *  1) there is chBackSlash not followed by a chDollarSign or chBackSlash
00866  *  2) there is an unescaped chDollarSign which is not followed by a digit
00867  *
00868  */
00869 void RegularExpression::subInExp(const XMLCh* const repString,
00870                                  const XMLCh* const origString,
00871                                  const Match* subEx,
00872                                  XMLBuffer &result,
00873                                  MemoryManager* const manager) const
00874 {
00875     int numSubExp = subEx->getNoGroups() - 1;
00876 
00877     for(const XMLCh *ptr = repString; *ptr != chNull; ++ptr) {
00878         if(*ptr == chDollarSign) {
00879             ++ptr;
00880 
00881             // check that after the $ is a digit
00882             if(!XMLString::isDigit(*ptr)) {
00883                 // invalid replace string - $ must be followed by a digit
00884                 ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Regex_InvalidRepPattern, manager);
00885             }
00886 
00887             int index = *ptr - chDigit_0;
00888 
00889             const XMLCh *dig = ptr + 1;
00890             while(XMLString::isDigit(*dig)) {
00891                 int newIndex = index * 10 + (*dig - chDigit_0);
00892                 if(newIndex > numSubExp) break;
00893 
00894                 index = newIndex;
00895                 ptr = dig;
00896                 ++dig;
00897             }
00898 
00899             // now check that the index is legal
00900             if(index <= numSubExp) {
00901                 int start = subEx->getStartPos(index);
00902                 int end = subEx->getEndPos(index);
00903 
00904                 // now copy the substring into the new string
00905                 if(start < end) {
00906                     result.append(origString + start, end - start);
00907                 }
00908             }
00909 
00910         } else {
00911             if(*ptr == chBackSlash) {
00912                 ++ptr;
00913 
00914                 // if you have a slash and then a character that's not a $ or /,
00915                 // then it's an invalid replace string
00916                 if(*ptr != chDollarSign && *ptr != chBackSlash) {
00917                     ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Regex_InvalidRepPattern, manager);
00918                 }
00919             }
00920 
00921             result.append(*ptr);
00922         }
00923     }
00924 }
00925 
00926 
00927 // -----------------------------------------------------------------------
00928 //  Static initialize and cleanup methods
00929 // -----------------------------------------------------------------------
00930 void
00931 XMLInitializer::initializeRegularExpression()
00932 {
00933     RegularExpression::staticInitialize(XMLPlatformUtils::fgMemoryManager);
00934 }
00935 
00936 void
00937 XMLInitializer::terminateRegularExpression()
00938 {
00939     RegularExpression::staticCleanup();
00940 }
00941 
00942 void
00943 RegularExpression::staticInitialize(MemoryManager* memoryManager)
00944 {
00945     fWordRange = TokenFactory::staticGetRange(fgUniIsWord, false);
00946 
00947     if (fWordRange == 0)
00948         ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Regex_RangeTokenGetError, fgUniIsWord, memoryManager);
00949 }
00950 
00951 // ---------------------------------------------------------------------------
00952 //  RegularExpression: Helper methods
00953 // ---------------------------------------------------------------------------
00954 int RegularExpression::getOptionValue(const XMLCh ch) {
00955 
00956     int ret = 0;
00957 
00958     switch (ch) {
00959 
00960         case chLatin_i:
00961             ret = IGNORE_CASE;
00962             break;
00963         case chLatin_m:
00964             ret = MULTIPLE_LINE;
00965             break;
00966         case chLatin_s:
00967             ret = SINGLE_LINE;
00968             break;
00969         case chLatin_x:
00970             ret = EXTENDED_COMMENT;
00971             break;
00972         case chLatin_F:
00973             ret = PROHIBIT_FIXED_STRING_OPTIMIZATION;
00974             break;
00975         case chLatin_H:
00976             ret = PROHIBIT_HEAD_CHARACTER_OPTIMIZATION;
00977             break;
00978         case chLatin_X:
00979             ret = XMLSCHEMA_MODE;
00980             break;
00981         default:
00982             break;
00983     }
00984 
00985     return ret;
00986 }
00987 
00988 struct RE_RuntimeContext {
00989     const Op    *op_;
00990     XMLSize_t   offs_;
00991 
00992     RE_RuntimeContext(const Op *op, XMLSize_t offs) : op_(op), offs_(offs) { }
00993 };
00994 
/*
 * Executes the chain of compiled operations starting at 'operations'
 * against the input described by 'context', beginning at 'offset'.
 *
 * Returns the offset reached once the chain runs out of operations, or -1
 * when the chain cannot be matched from here.
 *
 * When context->fLimit is greater than 256 an explicit stack (opStack) is
 * allocated; O_CLOSURE and O_QUESTION children are then evaluated by pushing
 * the current operation/offset and continuing the loop, instead of calling
 * match() recursively. The other closure flavours, O_UNION and O_CAPTURE
 * always recurse.
 */
00995 int RegularExpression::match(Context* const context, const Op* const operations,
00996                              XMLSize_t offset) const
00997 {
00998     ValueStackOf<RE_RuntimeContext>* opStack=NULL;
00999     Janitor<ValueStackOf<RE_RuntimeContext> > janStack(NULL);
01000     if(context->fLimit > 256)
01001     {
01002         opStack=new ValueStackOf<RE_RuntimeContext>(16, context->fMemoryManager);
              // hand the stack to the janitor declared above
01003         janStack.reset(opStack);
01004     }
01005     const Op* tmpOp = operations;
01006     bool ignoreCase = isSet(context->fOptions, IGNORE_CASE);
01007     int doReturn;
01008 
01009     while (tmpOp != 0) {
01010         // no one wants to return -5, only -1, 0, and greater
              // (-5 therefore serves as the "no result yet, keep looping" marker)
01011         doReturn = -5;
01012 
              // an offset outside [fStart, fLimit] fails this path immediately
01013         if (offset > context->fLimit || offset < context->fStart)
01014             doReturn = -1;
01015         else
01016         {
01017             switch(tmpOp->getOpType()) {
                      // Simple matchers: failure yields -1, success moves on to the
                      // next operation. Except for matchAnchor, the helpers receive
                      // 'offset' by reference and advance it themselves.
01018                 case Op::O_CHAR:
01019                     if (!matchChar(context, tmpOp->getData(), offset, ignoreCase))
01020                         doReturn = -1;
01021                     else
01022                         tmpOp = tmpOp->getNextOp();
01023                     break;
01024                 case Op::O_DOT:
01025                     if (!matchDot(context, offset))
01026                         doReturn = -1;
01027                     else
01028                         tmpOp = tmpOp->getNextOp();
01029                     break;
01030                 case Op::O_RANGE:
01031                 case Op::O_NRANGE:
01032                     if (!matchRange(context, tmpOp, offset, ignoreCase))
01033                         doReturn = -1;
01034                     else
01035                         tmpOp = tmpOp->getNextOp();
01036                     break;
01037                 case Op::O_ANCHOR:
01038                     if (!matchAnchor(context, tmpOp->getData(), offset))
01039                         doReturn = -1;
01040                     else
01041                         tmpOp = tmpOp->getNextOp();
01042                     break;
01043                 case Op::O_BACKREFERENCE:
01044                     if (!matchBackReference(context, tmpOp->getData(), offset,
01045                                             ignoreCase))
01046                         doReturn = -1;
01047                     else
01048                         tmpOp = tmpOp->getNextOp();
01049                     break;
01050                 case Op::O_STRING:
01051                     if (!matchString(context, tmpOp->getLiteral(), offset, ignoreCase))
01052                         doReturn = -1;
01053                     else
01054                         tmpOp = tmpOp->getNextOp();
01055                     break;
01056                 case Op::O_FINITE_CLOSURE:
01057                 {
01058                     XMLInt32 id = tmpOp->getData();
01059                     // if id is not -1, it's a closure with a child token having a minimum length,
01060                     // where id is the index of the fOffsets array where its status is stored
01061                     if (id >= 0) {
01062                         int prevOffset = context->fOffsets[id];
01063                         if (prevOffset < 0 || prevOffset != (int)offset) {
                                  // first visit, or visited at a different offset: record it
01064                             context->fOffsets[id] = (int)offset;
01065                         }
01066                         else {
01067                             // the status didn't change, we haven't found other copies; move on to the next match
01068                             context->fOffsets[id] = -1;
01069                             tmpOp = tmpOp->getNextOp();
01070                             break;
01071                         }
01072                     }
01073 
01074                     // match the subitems until they do
                          // (i.e. repeat the child until it fails or stops consuming input)
01075                     int ret;
01076                     while((ret = match(context, tmpOp->getChild(), offset)) != -1)
01077                     {
01078                         if(offset == (XMLSize_t)ret)
01079                             break;
01080                         offset = ret;
01081                     }
01082 
01083                     if (id >= 0) {
01084                         // loop has ended, reset the status for this closure
01085                         context->fOffsets[id] = -1;
01086                     }
01087                     tmpOp = tmpOp->getNextOp();
01088                 }
01089                 break;
01090                 case Op::O_FINITE_NONGREEDYCLOSURE:
01091                 {
                          // try the rest of the chain first; only if that fails is the
                          // child repeated
01092                     int ret = match(context,tmpOp->getNextOp(),offset);
01093                     if (ret >= 0)
01094                         doReturn = ret;
01095                     else
01096                     {
01097                         // match the subitems until they do
                              // NOTE: this 'ret' shadows the one declared above
01098                         int ret;
01099                         while((ret = match(context, tmpOp->getChild(), offset)) != -1)
01100                         {
01101                             if(offset == (XMLSize_t)ret)
01102                                 break;
01103                             offset = ret;
01104                         }
01105                         tmpOp = tmpOp->getNextOp();
01106                     }
01107                 }
01108                 break;
01109                 case Op::O_CLOSURE:
01110                 {
01111                     XMLInt32 id = tmpOp->getData();
01112                     // if id is not -1, it's a closure with a child token having a minimum length,
01113                     // where id is the index of the fOffsets array where its status is stored
01114                     if (id >= 0) {
01115                         int prevOffset = context->fOffsets[id];
01116                         if (prevOffset < 0 || prevOffset != (int)offset) {
01117                             context->fOffsets[id] = (int)offset;
01118                         }
01119                         else {
01120                             // the status didn't change, we haven't found other copies; move on to the next match
01121                             context->fOffsets[id] = -1;
01122                             tmpOp = tmpOp->getNextOp();
01123                             break;
01124                         }
01125                     }
01126 
01127                     if(opStack!=NULL)
01128                     {
                              // explicit-stack mode: remember this op and offset, then
                              // continue the loop inside the child; the code after the
                              // switch pops this entry when the child path has a result
01129                         opStack->push(RE_RuntimeContext(tmpOp, offset));
01130                         tmpOp = tmpOp->getChild();
01131                     }
01132                     else
01133                     {
                              // recursive mode: a non-negative child result is the
                              // result of this path, otherwise skip the closure
01134                         int ret = match(context, tmpOp->getChild(), offset);
01135                         if (id >= 0) {
01136                             context->fOffsets[id] = -1;
01137                         }
01138                         if (ret >= 0)
01139                             doReturn = ret;
01140                         else
01141                             tmpOp = tmpOp->getNextOp();
01142                     }
01143                 }
01144                 break;
01145                 case Op::O_QUESTION:
01146                 {
                          // same two modes as O_CLOSURE, without the fOffsets bookkeeping
01147                     if(opStack!=NULL)
01148                     {
01149                         opStack->push(RE_RuntimeContext(tmpOp, offset));
01150                         tmpOp = tmpOp->getChild();
01151                     }
01152                     else
01153                     {
01154                         int ret = match(context, tmpOp->getChild(), offset);
01155                         if (ret >= 0)
01156                             doReturn = ret;
01157                         else
01158                             tmpOp = tmpOp->getNextOp();
01159                     }
01160                 }
01161                 break;
01162                 case Op::O_NONGREEDYCLOSURE:
01163                 case Op::O_NONGREEDYQUESTION:
01164                 {
                          // prefer the rest of the chain; descend into the child only
                          // when the rest does not match from here
01165                     int ret = match(context,tmpOp->getNextOp(),offset);
01166                     if (ret >= 0)
01167                         doReturn = ret;
01168                     else
01169                         tmpOp = tmpOp->getChild();
01170                 }
01171                 break;
01172                 case Op::O_UNION:
                          // matchUnion() returns the final result for this path
01173                     doReturn = matchUnion(context, tmpOp, offset);
01174                     break;
01175                 case Op::O_CAPTURE:
                          // positions are only recorded when a Match object is attached
                          // and the op carries non-zero data; otherwise the op is skipped
01176                     if (context->fMatch != 0 && tmpOp->getData() != 0)
01177                         doReturn = matchCapture(context, tmpOp, offset);
01178                     else
01179                         tmpOp = tmpOp->getNextOp();
01180                     break;
01181             }
01182         }
              // doReturn != -5: the current path produced a result (an offset or -1)
01183         if (doReturn != -5) {
                  // nothing suspended on the explicit stack: that result is final
01184             if (opStack==NULL || opStack->size() == 0)
01185                 return doReturn;
                  // otherwise go back to the most recently suspended operation
01186             RE_RuntimeContext ctx = opStack->pop();
01187             tmpOp = ctx.op_;
01188             offset = ctx.offs_;
01189             if (tmpOp->getOpType() == Op::O_CLOSURE) {
01190                 XMLInt32 id = tmpOp->getData();
01191                 if (id >= 0) {
01192                     // loop has ended, reset the status for this closure
01193                     context->fOffsets[id] = -1;
01194                 }
01195             }
01196             if (tmpOp->getOpType() == Op::O_CLOSURE || tmpOp->getOpType() == Op::O_QUESTION) {
                      // the child path succeeded: its result is returned as-is
01197                 if (doReturn >= 0)
01198                     return doReturn;
01199             }
                  // the child path failed: continue after the popped op at the saved offset
01200             tmpOp = tmpOp->getNextOp();
01201         }
01202     }
01203 
          // ran out of operations: everything up to 'offset' matched
01204     return (int)offset;
01205 }
01206 
01207 bool RegularExpression::matchChar(Context* const context,
01208                                   const XMLInt32 ch, XMLSize_t& offset,
01209                                   const bool ignoreCase) const
01210 {
01211     if (offset >= context->fLimit)
01212         return false;
01213 
01214     XMLInt32 strCh = 0;
01215 
01216     if (!context->nextCh(strCh, offset))
01217         return false;
01218 
01219     bool match = ignoreCase ? matchIgnoreCase(ch, strCh)
01220                             : (ch == strCh);
01221     if (!match)
01222         return false;
01223 
01224     ++offset;
01225 
01226     return true;
01227 }
01228 
01229 bool RegularExpression::matchDot(Context* const context, XMLSize_t& offset) const
01230 {
01231     if (offset >= context->fLimit)
01232         return false;
01233 
01234     XMLInt32 strCh = 0;
01235 
01236     if (!context->nextCh(strCh, offset))
01237         return false;
01238 
01239     if (!isSet(context->fOptions, SINGLE_LINE)) {
01240 
01241         if (RegxUtil::isEOLChar(strCh))
01242             return false;
01243     }
01244 
01245     ++offset;
01246     return true;
01247 }
01248 
01249 bool RegularExpression::matchRange(Context* const context, const Op* const op,
01250                                    XMLSize_t& offset, const bool ignoreCase) const
01251 {
01252     if (offset >= context->fLimit)
01253         return false;
01254 
01255     XMLInt32 strCh = 0;
01256 
01257     if (!context->nextCh(strCh, offset))
01258         return false;
01259 
01260     RangeToken* tok = (RangeToken *) op->getToken();
01261     bool match = false;
01262 
01263     if (ignoreCase) {
01264         tok = tok->getCaseInsensitiveToken(fTokenFactory);
01265     }
01266 
01267     match = tok->match(strCh);
01268 
01269     if (!match)
01270         return false;
01271 
01272     ++offset;
01273     return true;
01274 }
01275 
01276 bool RegularExpression::matchAnchor(Context* const context, const XMLInt32 ch,
01277                                     const XMLSize_t offset) const
01278 {
01279     switch ((XMLCh) ch) {
01280     case chDollarSign:
01281         if (isSet(context->fOptions, MULTIPLE_LINE)) {
01282             if (!(offset == context->fLimit || (offset < context->fLimit
01283                 && RegxUtil::isEOLChar(context->fString[offset]))))
01284                 return false;
01285         }
01286         else {
01287 
01288             if (!(offset == context->fLimit
01289                 || (offset+1 == context->fLimit
01290                     && RegxUtil::isEOLChar(context->fString[offset]))
01291                 || (offset+2 == context->fLimit
01292                     && context->fString[offset] == chCR
01293                     && context->fString[offset+1] == chLF)))
01294                 return false;
01295         }
01296         break;
01297     case chCaret:
01298         if (!isSet(context->fOptions, MULTIPLE_LINE)) {
01299 
01300             if (offset != context->fStart)
01301                 return false;
01302         }
01303         else {
01304 
01305             if (!(offset == context->fStart || (offset > context->fStart
01306                 && RegxUtil::isEOLChar(context->fString[offset-1]))))
01307                 return false;
01308         }
01309         break;
01310     }
01311 
01312     return true;
01313 }
01314 
01315 bool RegularExpression::matchBackReference(Context* const context,
01316                                            const XMLInt32 refNo, XMLSize_t& offset,
01317                                            const bool ignoreCase) const
01318 {
01319     if (refNo <=0 || refNo >= fNoGroups)
01320         ThrowXMLwithMemMgr(IllegalArgumentException, XMLExcepts::Regex_BadRefNo, context->fMemoryManager);
01321 
01322     // If the group we're matching against wasn't matched,
01323     // the back reference matches the empty string
01324     if (context->fMatch->getStartPos(refNo) < 0 || context->fMatch->getEndPos(refNo) < 0)
01325         return true;
01326 
01327     int start = context->fMatch->getStartPos(refNo);
01328     int length = context->fMatch->getEndPos(refNo) - start;
01329 
01330     if (int(context->fLimit - offset) < length)
01331         return false;
01332 
01333     bool match = ignoreCase ? XMLString::regionIMatches(context->fString,(int)offset,
01334                                                         context->fString,start,length)
01335                             : XMLString::regionMatches(context->fString, (int)offset,
01336                                                        context->fString, start,length);
01337 
01338     if (match) offset += length;
01339     return match;
01340 }
01341 
01342 bool RegularExpression::matchString(Context* const context,
01343                                     const XMLCh* const literal, XMLSize_t& offset,
01344                                     const bool ignoreCase) const
01345 {
01346     XMLSize_t length = XMLString::stringLen(literal);
01347 
01348     if (context->fLimit - offset < length)
01349         return false;
01350 
01351     bool match = ignoreCase ? XMLString::regionIMatches(context->fString, (int)offset,
01352                                                         literal, 0, length)
01353                             : XMLString::regionMatches(context->fString, (int)offset,
01354                                                        literal, 0, length);
01355     if (match) offset += length;
01356     return match;
01357 }
01358 
01359 int RegularExpression::matchCapture(Context* const context, const Op* const op,
01360                                     XMLSize_t offset) const
01361 {
01362     // No check is made for nullness of fMatch as the function is only called if
01363     // fMatch is not null.
01364     XMLInt32 index = op->getData();
01365     int save = (index > 0) ? context->fMatch->getStartPos(index)
01366                            : context->fMatch->getEndPos(-index);
01367 
01368     if (index > 0) {
01369         context->fMatch->setStartPos(index, (int)offset);
01370         int ret = match(context, op->getNextOp(), offset);
01371         if (ret < 0)
01372             context->fMatch->setStartPos(index, save);
01373         return ret;
01374     }
01375 
01376     context->fMatch->setEndPos(-index, (int)offset);
01377     int ret = match(context, op->getNextOp(), offset);
01378     if (ret < 0)
01379         context->fMatch->setEndPos(-index, save);
01380     return ret;
01381 }
01382 
01383 int RegularExpression::matchUnion(Context* const context,
01384                                    const Op* const op, XMLSize_t offset) const
01385 {
01386     XMLSize_t opSize = op->getSize();
01387 
01388     Context bestResultContext;
01389     int bestResult=-1;
01390     for(XMLSize_t i=0; i < opSize; i++) {
01391         Context tmpContext(context);
01392         int ret = match(&tmpContext, op->elementAt(i), offset);
01393         if (ret >= 0 && (XMLSize_t)ret <= context->fLimit && ret>bestResult)
01394         {
01395             bestResult=ret;
01396             bestResultContext=tmpContext;
01397             // exit early, if we reached the end of the string
01398             if((XMLSize_t)ret == context->fLimit)
01399                 break;
01400         }
01401     }
01402     if(bestResult!=-1)
01403         *context=bestResultContext;
01404     return bestResult;
01405 }
01406 
01407 
01408 int RegularExpression::parseOptions(const XMLCh* const options)
01409 {
01410 
01411     if (options == 0)
01412         return 0;
01413 
01414     int opts = 0;
01415     XMLSize_t length = XMLString::stringLen(options);
01416 
01417     for (XMLSize_t i=0; i < length; i++) {
01418 
01419         int v = getOptionValue(options[i]);
01420 
01421         if (v == 0)
01422             ThrowXMLwithMemMgr1(ParseException, XMLExcepts::Regex_UnknownOption, options, fMemoryManager);
01423 
01424         opts |= v;
01425     }
01426 
01427     return opts;
01428 }
01429 
01430 void RegularExpression::compile(const Token* const token) {
01431 
01432     if (fOperations != 0)
01433         return;
01434 
01435     fNoClosures = 0;
01436     fOperations = compile(token, 0, false);
01437 }
01438 
01439 Op* RegularExpression::compile(const Token* const token, Op* const next,
01440                                const bool reverse)
01441 {
01442 
01443     Op* ret = 0;
01444 
01445     const Token::tokType tokenType = token->getTokenType();
01446 
01447     switch(tokenType) {
01448     case Token::T_DOT:
01449         ret = fOpFactory.createDotOp();
01450         ret->setNextOp(next);
01451         break;
01452     case Token::T_CHAR:
01453         ret = fOpFactory.createCharOp(token->getChar());
01454         ret->setNextOp(next);
01455         break;
01456     case Token::T_ANCHOR:
01457         ret = fOpFactory.createAnchorOp(token->getChar());
01458         ret->setNextOp(next);
01459         break;
01460     case Token::T_RANGE:
01461     case Token::T_NRANGE:
01462         ret = fOpFactory.createRangeOp(token);
01463         ret->setNextOp(next);
01464         break;
01465     case Token::T_STRING:
01466         ret = fOpFactory.createStringOp(token->getString());
01467         ret->setNextOp(next);
01468         break;
01469     case Token::T_BACKREFERENCE:
01470         ret = fOpFactory.createBackReferenceOp(token->getReferenceNo());
01471         ret->setNextOp(next);
01472         break;
01473     case Token::T_EMPTY:
01474         ret = next;
01475         break;
01476     case Token::T_CONCAT:
01477         ret = compileConcat(token, next, reverse);
01478         break;
01479     case Token::T_UNION:
01480         ret = compileUnion(token, next, reverse);
01481         break;
01482     case Token::T_CLOSURE:
01483     case Token::T_NONGREEDYCLOSURE:
01484         ret = compileClosure(token, next, reverse, tokenType);
01485         break;
01486     case Token::T_PAREN:
01487         ret = compileParenthesis(token, next, reverse);
01488         break;
01489     default:
01490         ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Regex_UnknownTokenType, fMemoryManager);
01491         break; // this line to be deleted
01492     }
01493 
01494     return ret;
01495 }
01496 
01497 /*
01498  * Prepares for matching. This method is called during construction.
01499  */
01500 void RegularExpression::prepare() {
01501 
01502     compile(fTokenTree);
01503 
01504     fMinLength = fTokenTree->getMinLength();
01505     fFirstChar = 0;
01506 
01507     if (!isSet(fOptions, PROHIBIT_HEAD_CHARACTER_OPTIMIZATION) &&
01508         !isSet(fOptions, XMLSCHEMA_MODE))                            {
01509 
01510         RangeToken* rangeTok = fTokenFactory->createRange();
01511         Token::firstCharacterOptions result = fTokenTree->analyzeFirstCharacter(rangeTok, fOptions, fTokenFactory);
01512 
01513         if (result == Token::FC_TERMINAL) {
01514 
01515             rangeTok->compactRanges();
01516             fFirstChar = rangeTok;
01517         }
01518 
01519         rangeTok->createMap();
01520 
01521         if (isSet(fOptions, IGNORE_CASE))
01522         {
01523             rangeTok->getCaseInsensitiveToken(fTokenFactory);
01524         }
01525     }
01526 
01527     if (fOperations != 0 && fOperations->getNextOp() == 0 &&
01528         (fOperations->getOpType() == Op::O_STRING ||
01529          fOperations->getOpType() == Op::O_CHAR) &&
01530          !isSet(fOptions, IGNORE_CASE) )                      {
01531 
01532         fFixedStringOnly = true;
01533 
01534         if (fOperations->getOpType() == Op::O_STRING) {
01535             fMemoryManager->deallocate(fFixedString);//delete [] fFixedString;
01536             fFixedString = XMLString::replicate(fOperations->getLiteral(), fMemoryManager);
01537         }
01538         else{
01539 
01540             XMLInt32 ch = fOperations->getData();
01541 
01542             if ( ch >= 0x10000) { // add as constant
01543                 fMemoryManager->deallocate(fFixedString);//delete [] fFixedString;
01544                 fFixedString = RegxUtil::decomposeToSurrogates(ch, fMemoryManager);
01545             }
01546             else {
01547 
01548                 XMLCh* dummyStr = (XMLCh*) fMemoryManager->allocate(2 * sizeof(XMLCh));//new XMLCh[2];
01549                 dummyStr[0] = (XMLCh) fOperations->getData();
01550                 dummyStr[1] = chNull;
01551                 fMemoryManager->deallocate(fFixedString);//delete [] fFixedString;
01552                 fFixedString = dummyStr;
01553             }
01554         }
01555 
01556         fBMPattern = new (fMemoryManager) BMPattern(fFixedString, 256,
01557                                                     isSet(fOptions, IGNORE_CASE), fMemoryManager);
01558     }
01559     else if (!isSet(fOptions, XMLSCHEMA_MODE) &&
01560              !isSet(fOptions, PROHIBIT_FIXED_STRING_OPTIMIZATION) &&
01561              !isSet(fOptions, IGNORE_CASE)) {
01562 
01563         int fixedOpts = 0;
01564         Token* tok = fTokenTree->findFixedString(fOptions, fixedOpts);
01565 
01566         fMemoryManager->deallocate(fFixedString);//delete [] fFixedString;
01567 
01568         fFixedString = (tok == 0) ? 0
01569             : XMLString::replicate(tok->getString(), fMemoryManager);
01570 
01571         if (fFixedString != 0 && XMLString::stringLen(fFixedString) < 2) {
01572 
01573             fMemoryManager->deallocate(fFixedString);//delete [] fFixedString;
01574             fFixedString = 0;
01575         }
01576 
01577         if (fFixedString != 0) {
01578 
01579             fBMPattern = new (fMemoryManager) BMPattern(fFixedString, 256,
01580                                                         isSet(fixedOpts, IGNORE_CASE), fMemoryManager);
01581         }
01582     }
01583 }
01584 
01585 bool RegularExpression::doTokenOverlap(const Op* op, Token* token)
01586 {
01587     if(op->getOpType()==Op::O_RANGE)
01588     {
01589         RangeToken* t1=(RangeToken*)op->getToken();
01590         switch(token->getTokenType())
01591         {
01592         case Token::T_CHAR:
01593             return t1->match(token->getChar());
01594         case Token::T_STRING:
01595             return t1->match(*token->getString());
01596         case Token::T_RANGE:
01597             {
01598                 try
01599                 {
01600                     RangeToken tempRange(Token::T_RANGE, fMemoryManager);
01601                     tempRange.mergeRanges(t1);
01602                     tempRange.intersectRanges((RangeToken*)token);
01603                     return !tempRange.empty();
01604                 }
01605                 catch(RuntimeException&)
01606                 {
01607                 }
01608                 break;
01609             }
01610         default:
01611             break;
01612         }
01613         return true;
01614     }
01615 
01616     XMLInt32 ch=0;
01617     if(op->getOpType()==Op::O_CHAR)
01618         ch=op->getData();
01619     else if(op->getOpType()==Op::O_STRING)
01620         ch=*op->getLiteral();
01621 
01622     if(ch!=0)
01623     {
01624         switch(token->getTokenType())
01625         {
01626         case Token::T_CHAR:
01627             return token->getChar()==ch;
01628         case Token::T_STRING:
01629             return *token->getString()==ch;
01630         case Token::T_RANGE:
01631         case Token::T_NRANGE:
01632             return ((RangeToken*)token)->match(ch);
01633         default:
01634             break;
01635         }
01636     }
01637     // in any other case, there is the chance that they overlap
01638     return true;
01639 }
01640 
01641 XERCES_CPP_NAMESPACE_END
01642