00001
00002
00003 #if defined( _MBCS )
00004 #pragma message( __FILEINFO__ "This code is broken under _MBCS, " \
00005 "see the comments at the top of this file." )
00006 #endif //_MBCS
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
00051
00052
00053
00054
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064
00065
00066
00067
00068
00069
00070 #include "stdafx.h"
00071 #include "regexp.h"
00072
00073
00074
00075
00076 const TCHAR MAGIC = ((TCHAR)'\234');
00077
00078 #define new DEBUG_NEW
00079
00080 #pragma warning( disable : 4711 ) // automatic inline selected
00081
00082
00083
00084
00085
00086
00087
00088
00089
00090
00091
00092
00093
00094
00095
00096
00097
00098
00099
00100
00101
00102
00103
00104
00105
00106
00107
00108
00109
00110
00111
00112
00113
00114
00115 enum {
00116
00117 END = 0,
00118 BOL = 1,
00119 EOL = 2,
00120 ANY = 3,
00121 ANYOF = 4,
00122 ANYBUT = 5,
00123 BRANCH = 6,
00124 BACK = 7,
00125 EXACTLY = 8,
00126 NOTHING = 9,
00127 STAR = 10,
00128 PLUS = 11,
00129 WORDA = 12,
00130 WORDZ = 13,
00131 OPEN = 20,
00132
00133 CLOSE = 30
00134 };
00135
00136
00137
00138
00139
00140
00141
00142
00143
00144
00145
00146
00147
00148
00149
00150
00151
00152
00153
00154
00155
00156
00157
00158
00159
00160
00161
00162
00163
00164
00165
00166 enum
00167 {
00168 REGERR_SENTINEL_VALUE = 0,
00169 REGERR_NULLARG = 1, REGERR_CORRUPTED, REGERR_CORRUPTION, REGERR_CORRUPTED_POINTERS,
00170 REGERR_BAD_REGREPEAT, REGERR_CORRUPTED_OPCODE, REGERR_NULL_TO_REGSUB,
00171 REGERR_DAMAGED_REGEXP_REGSUB, REGERR_DAMAGED_MATCH_STRING, REGERR_NULL_TO_REGCOMP,
00172 REGERR_TO_BIG, REGERR_TO_MANY_PAREN, REGERR_UNTERMINATED_PAREN, REGERR_UNMATCHED_PAREN,
00173 REGERR_INTERNAL_ERROR_JUNK, REGERR_OP_COULD_BE_EMPTY, REGERR_NESTED_OP, REGERR_INVALID_RANGE,
00174 REGERR_UNMATCHED_BRACE, REGERR_INTERNAL_UNEXPECTED_CHAR, REGERR_OP_FOLLOWS_NOTHING,
00175 REGERR_TRAILING_ESC, REGERR_INTERNAL_STRSCSPN, REGERR_NO_REGEXP
00176 };
00177
00178 struct regErr
00179 {
00180 int m_id;
00181 LPCTSTR m_err;
00182 } errors[] = {
00183 { REGERR_NULLARG, _T( "NULL argument to regexec" ) },
00184 { REGERR_CORRUPTED, _T( "corrupted regexp" ) },
00185 { REGERR_CORRUPTION, _T( "regexp corruption" ) },
00186 { REGERR_CORRUPTED_POINTERS, _T( "corrupted pointers" ) },
00187 { REGERR_BAD_REGREPEAT, _T( "internal error: bad call of regrepeat" ) },
00188 { REGERR_CORRUPTED_OPCODE, _T( "corrupted opcode" ) },
00189 { REGERR_NULL_TO_REGSUB, _T( "NULL parm to regsub" ) },
00190 { REGERR_DAMAGED_REGEXP_REGSUB, _T( "damaged regexp fed to regsub" ) },
00191 { REGERR_DAMAGED_MATCH_STRING, _T( "damaged match string" ) },
00192 { REGERR_NULL_TO_REGCOMP, _T( "NULL argument to regcomp" ) },
00193 { REGERR_TO_BIG, _T( "regexp too big" ) },
00194 { REGERR_TO_MANY_PAREN, _T( "too many ()" ) },
00195 { REGERR_UNTERMINATED_PAREN, _T( "unterminated ()" ) },
00196 { REGERR_UNMATCHED_PAREN, _T( "unmatched ()" ) },
00197 { REGERR_INTERNAL_ERROR_JUNK, _T( "internal error: junk on end" ) },
00198 { REGERR_OP_COULD_BE_EMPTY, _T( "*+ operand could be empty" ) },
00199 { REGERR_NESTED_OP, _T( "nested *?+" ) },
00200 { REGERR_INVALID_RANGE, _T( "invalid [] range" ) },
00201 { REGERR_UNMATCHED_BRACE, _T( "unmatched []" ) },
00202 { REGERR_INTERNAL_UNEXPECTED_CHAR, _T( "internal error: \\0|) unexpected" ) },
00203 { REGERR_OP_FOLLOWS_NOTHING, _T( "?+* follows nothing" ) },
00204 { REGERR_TRAILING_ESC, _T( "trailing \\" ) },
00205 { REGERR_INTERNAL_STRSCSPN, _T( "internal error: strcspn 0" ) },
00206 { REGERR_NO_REGEXP, _T( "NULL regexp" ) },
00207 { REGERR_SENTINEL_VALUE, _T( "Unknown error") }
00208 };
00209
00210
00211
00212 enum {
00213 HASWIDTH = 01,
00214 SIMPLE = 02,
00215 SPSTART = 04,
00216 WORST = 0
00217 };
00218
00220
00221 class CRegErrorHandler
00222 {
00223 friend Regexp;
00224 mutable CString m_szError;
00225 static LPCTSTR FindErr( int id );
00226 protected:
00227 void ClearErrorString() const;
00228 void regerror( LPCTSTR s ) const;
00229 void regerror( int id ) const;
00230 public:
00231 CRegErrorHandler() { }
00232 CRegErrorHandler( const CRegErrorHandler & reh ) : m_szError( reh.m_szError ) {}
00233
00234 const CString & GetErrorString() const;
00235 };
00236
00237 void CRegErrorHandler::regerror( LPCTSTR s ) const
00238 {
00239 TRACE1( "regerror: %s\n", s );
00240 m_szError = s;
00241 }
00242
00243 void CRegErrorHandler::regerror( int id ) const
00244 {
00245 regerror( FindErr( id ) );
00246 }
00247
00248 const CString & CRegErrorHandler::GetErrorString() const
00249 {
00250 return m_szError;
00251 }
00252
00253 void CRegErrorHandler::ClearErrorString() const
00254 {
00255 m_szError.Empty();
00256 }
00257
00258 LPCTSTR CRegErrorHandler::FindErr( int id )
00259 {
00260 struct regErr * perr;
00261 for ( perr = errors; perr->m_id != REGERR_SENTINEL_VALUE; perr++ )
00262 if ( perr->m_id == id )
00263 return perr->m_err;
00264
00265 return perr->m_err;
00266 }
00267
00269
00270
00271 class CRegProgramAccessor : public CRegErrorHandler
00272 {
00273 public:
00274 static inline TCHAR OP( LPTSTR p )
00275 {
00276 return (*(p));
00277 }
00278 static inline LPTSTR OPERAND( LPTSTR p )
00279 {
00280 return p + 3;
00281 }
00282 static inline LPTSTR regnext( LPTSTR p )
00283 {
00284 const short &offset = *((short*)(p+1));
00285
00286 if (offset == 0)
00287 return(NULL);
00288
00289 return((OP(p) == BACK) ? p-offset : p+offset);
00290 }
00291 #ifdef _RE_DEBUG
00292 LPTSTR CRegProgramAccessor::regprop( LPTSTR op );
00293 #endif
00294 };
00295
00297
00298
00299
00300
00301 class regexp : public CRegProgramAccessor
00302 {
00303 friend class CRegExecutor;
00304 friend class Regexp;
00305
00306 int m_programSize;
00307 LPTSTR startp[Regexp::NSUBEXP];
00308 LPTSTR endp[Regexp::NSUBEXP];
00309 TCHAR regstart;
00310 TCHAR reganch;
00311 LPTSTR regmust;
00312 int regmlen;
00313 LPTSTR program;
00314
00315 bool status;
00316 int count;
00317 int numSubs;
00318 public:
00319
00320 regexp( LPCTSTR exp, BOOL iCase );
00321 regexp( const regexp & r );
00322 ~regexp();
00323
00324 void ignoreCase( const TCHAR * in, TCHAR * out );
00325
00326 bool regcomp( LPCTSTR exp );
00327 bool regexec( LPCTSTR string );
00328 bool Status() const { return status; }
00329
00330 CString GetReplaceString( const TCHAR* sReplaceExp ) const;
00331
00332 regexp * getCopy();
00333
00334 #ifdef _RE_DEBUG
00335 void regdump();
00336 #endif
00337
00338 #ifdef _DEBUG
00339 CString m_originalPattern;
00340 CString m_modifiedPattern;
00341 #endif
00342 };
00343
00345
00346
00347 class CRegCompilerBase : public CRegProgramAccessor
00348 {
00349 public:
00350 CRegCompilerBase( LPCTSTR parse );
00351
00352 LPTSTR reg(int paren, int *flagp);
00353 protected:
00354 LPTSTR regparse;
00355 int regnpar;
00356
00357 LPTSTR regbranch(int *flagp);
00358 LPTSTR regpiece(int *flagp);
00359 LPTSTR regatom(int *flagp);
00360 inline bool ISREPN( TCHAR c ) { return ((c) == '*' || (c) == '+' || (c) == '?'); }
00361
00362 virtual void regc(int c) = 0;
00363 virtual LPTSTR regnode(int op) = 0;
00364 virtual void reginsert(TCHAR op, LPTSTR opnd) = 0;
00365 virtual void regtail(LPTSTR p, LPTSTR val) = 0;
00366 virtual void regoptail(LPTSTR p, LPTSTR val) = 0;
00367 };
00368
00370
00371
00372
00373 class CRegValidator : public CRegCompilerBase
00374 {
00375 public:
00376 CRegValidator( LPCTSTR parse );
00377
00378 inline long Size() const { return regsize; }
00379 private:
00380 long regsize;
00381 TCHAR regdummy[3];
00382 protected:
00383 LPTSTR regnode(int) { regsize += 3; return regdummy; }
00384 void regc(int) { regsize++; }
00385 void reginsert(TCHAR, LPTSTR) { regsize += 3; }
00386 void regtail(LPTSTR, LPTSTR) { return; }
00387 void regoptail(LPTSTR, LPTSTR) { return; }
00388 };
00389
00391
00392
00393 class CRegCompiler : public CRegCompilerBase
00394 {
00395 public:
00396 CRegCompiler( LPCTSTR parse, LPTSTR prog );
00397 private:
00398 LPTSTR regcode;
00399 protected:
00400
00401 void regc(int b)
00402 {
00403 *regcode++ = (TCHAR)b;
00404 }
00405 LPTSTR regnode(int op);
00406 void reginsert(TCHAR op, LPTSTR opnd);
00407 void regtail(LPTSTR p, LPTSTR val);
00408 void regoptail(LPTSTR p, LPTSTR val);
00409 };
00410
00411
00412 LPTSTR CRegCompiler::regnode(int op)
00413 {
00414 LPTSTR const ret = regcode;
00415
00416 LPTSTR ptr = ret;
00417 *ptr++ = (TCHAR)op;
00418 *ptr++ = '\0';
00419 *ptr++ = '\0';
00420 regcode = ptr;
00421
00422 return(ret);
00423 }
00424
00425
00426
00427
00428 void CRegCompiler::reginsert(TCHAR op, LPTSTR opnd)
00429 {
00430 LPTSTR place;
00431
00432 (void) memmove(opnd+3, opnd, (size_t)((regcode - opnd)*sizeof(TCHAR)));
00433 regcode += 3;
00434
00435 place = opnd;
00436 *place++ = op;
00437 *place++ = '\0';
00438 *place++ = '\0';
00439 }
00440
00441
00442 void CRegCompiler::regtail(LPTSTR p, LPTSTR val)
00443 {
00444 LPTSTR scan;
00445 LPTSTR temp;
00446
00447
00448 for (scan = p; (temp = regnext(scan)) != NULL; scan = temp)
00449 continue;
00450
00451 *((short *)(scan+1)) = (short)((OP(scan) == BACK) ? scan - val : val - scan);
00452 }
00453
00454
00455 void CRegCompiler::regoptail(LPTSTR p, LPTSTR val)
00456 {
00457
00458 if (OP(p) == BRANCH)
00459 regtail(OPERAND(p), val);
00460 }
00461
00463
00464 CRegCompilerBase::CRegCompilerBase( LPCTSTR parse )
00465 : regparse( (LPTSTR)parse ),
00466 regnpar(1)
00467 {
00468 }
00469
00470 CRegValidator::CRegValidator( LPCTSTR parse )
00471 : CRegCompilerBase( parse ),
00472 regsize(0)
00473 {
00474 regc(MAGIC);
00475 regdummy[0] = NOTHING;
00476 regdummy[1] = regdummy[2] = 0;
00477 }
00478
00479 CRegCompiler::CRegCompiler( LPCTSTR parse, LPTSTR prog )
00480 : CRegCompilerBase( parse ),
00481 regcode(prog)
00482 {
00483 regc(MAGIC);
00484 }
00485
00487
00488 regexp::regexp( LPCTSTR exp, BOOL iCase )
00489 : regstart(0),
00490 reganch(0),
00491 regmust(0),
00492 regmlen(0),
00493 program(0),
00494 m_programSize(0)
00495 {
00496 #if _DEBUG
00497 m_originalPattern = exp;
00498 #endif
00499
00500 if ( iCase )
00501 {
00502 TCHAR * out = new TCHAR[(_tcslen( exp ) * 4) + 1];
00503 ignoreCase( exp, out );
00504
00505 #if _DEBUG
00506 m_modifiedPattern = out;
00507 #endif
00508 status = regcomp( out );
00509 delete [] out;
00510 }
00511 else
00512 status = regcomp( exp );
00513
00514 count = numSubs = 0;
00515 }
00516
00517 regexp::regexp( const regexp & orig )
00518 : regstart(orig.regstart),
00519 reganch(orig.reganch),
00520 regmlen(orig.regmlen),
00521 m_programSize(orig.m_programSize),
00522 numSubs(orig.numSubs),
00523 regmust(0)
00524 {
00525 #if _DEBUG
00526 m_originalPattern = orig.m_originalPattern;
00527 m_modifiedPattern = orig.m_modifiedPattern;
00528 #endif
00529 status = orig.status;
00530 count = 0;
00531 program = new TCHAR[m_programSize];
00532 memcpy( program, orig.program, m_programSize * sizeof( TCHAR ) );
00533 if ( orig.regmust )
00534 regmust = program + ( orig.regmust - orig.program );
00535
00536 for ( int i = Regexp::NSUBEXP; i > 0; i--)
00537 {
00538 startp[i] = orig.startp[i];
00539 endp[i] = orig.endp[i];
00540 }
00541 }
00542
00543 regexp::~regexp()
00544 {
00545 delete [] program;
00546 }
00547
00548
00549
00550
00551
00552
00553
00554
00555
00556
00557
00558
00559
00560
00561
00562
00563
00564 bool regexp::regcomp(LPCTSTR exp)
00565 {
00566 LPTSTR scan;
00567 int flags;
00568
00569 if (exp == NULL)
00570 {
00571 regerror( REGERR_NULL_TO_REGCOMP );
00572 return NULL;
00573 }
00574
00575
00576 CRegValidator tester( exp );
00577
00578 if (tester.reg(0, &flags) == NULL)
00579 return false;
00580
00581
00582 if (tester.Size() >= 0x7fffL)
00583 {
00584 regerror(REGERR_TO_BIG);
00585 return NULL;
00586 }
00587
00588 m_programSize = tester.Size();
00589
00590 program = new TCHAR[m_programSize];
00591
00592 CRegCompiler comp( exp, program );
00593
00594 if (comp.reg(0, &flags) == NULL)
00595 return false;
00596
00597 scan = program + 1;
00598 if (OP(regnext(scan)) == END)
00599 {
00600 scan = OPERAND(scan);
00601
00602
00603 if (OP(scan) == EXACTLY)
00604 regstart = *OPERAND(scan);
00605 else if (OP(scan) == BOL)
00606 reganch = 1;
00607
00608
00609
00610
00611
00612
00613
00614
00615 if (flags&SPSTART)
00616 {
00617 LPTSTR longest = NULL;
00618 size_t len = 0;
00619
00620 for (; scan != NULL; scan = regnext(scan))
00621 if (OP(scan) == EXACTLY && _tcslen(OPERAND(scan)) >= len)
00622 {
00623 longest = OPERAND(scan);
00624 len = _tcslen(OPERAND(scan));
00625 }
00626 regmust = longest;
00627 regmlen = (int)len;
00628 }
00629 }
00630
00631 return true;
00632 }
00633
00634 regexp * regexp::getCopy()
00635 {
00636 return new regexp( *this );
00637 }
00638
00639
00640
00641
00642
00643
00644
00645
00646
00647 LPTSTR CRegCompilerBase::reg( int paren, int *flagp )
00648 {
00649 LPTSTR ret;
00650 LPTSTR br;
00651 LPTSTR ender;
00652 int parno = 0;
00653 int flags;
00654
00655 *flagp = HASWIDTH;
00656
00657 if (paren)
00658 {
00659
00660 if (regnpar >= Regexp::NSUBEXP)
00661 {
00662 regerror(REGERR_TO_MANY_PAREN);
00663 return NULL;
00664 }
00665 parno = regnpar;
00666 regnpar++;
00667 ret = regnode(OPEN+parno);
00668 }
00669
00670
00671 br = regbranch(&flags);
00672 if (br == NULL)
00673 return(NULL);
00674 if (paren)
00675 regtail(ret, br);
00676 else
00677 ret = br;
00678 *flagp &= ~(~flags&HASWIDTH);
00679 *flagp |= flags&SPSTART;
00680 while (*regparse == '|')
00681 {
00682 regparse++;
00683 br = regbranch(&flags);
00684 if (br == NULL)
00685 return(NULL);
00686 regtail(ret, br);
00687 *flagp &= ~(~flags&HASWIDTH);
00688 *flagp |= flags&SPSTART;
00689 }
00690
00691
00692 ender = regnode((paren) ? CLOSE+parno : END);
00693 regtail(ret, ender);
00694
00695
00696 for (br = ret; br != NULL; br = regnext(br))
00697 regoptail(br, ender);
00698
00699
00700 if (paren && *regparse++ != ')')
00701 {
00702 regerror( REGERR_UNTERMINATED_PAREN );
00703 return NULL;
00704 }
00705 else if (!paren && *regparse != '\0')
00706 {
00707 if (*regparse == ')')
00708 {
00709 regerror( REGERR_UNMATCHED_PAREN );
00710 return NULL;
00711 }
00712 else
00713 {
00714 regerror( REGERR_INTERNAL_ERROR_JUNK );
00715 return NULL;
00716 }
00717
00718 }
00719
00720 return(ret);
00721 }
00722
00723
00724
00725
00726
00727 LPTSTR CRegCompilerBase::regbranch(int *flagp)
00728 {
00729 LPTSTR ret;
00730 LPTSTR chain;
00731 LPTSTR latest;
00732 int flags;
00733 int c;
00734
00735 *flagp = WORST;
00736
00737 ret = regnode(BRANCH);
00738 chain = NULL;
00739 while ((c = *regparse) != '\0' && c != '|' && c != ')')
00740 {
00741 latest = regpiece(&flags);
00742 if (latest == NULL)
00743 return(NULL);
00744 *flagp |= flags&HASWIDTH;
00745 if (chain == NULL)
00746 *flagp |= flags&SPSTART;
00747 else
00748 regtail(chain, latest);
00749 chain = latest;
00750 }
00751 if (chain == NULL)
00752 (void) regnode(NOTHING);
00753
00754 return(ret);
00755 }
00756
00757
00758
00759
00760
00761
00762
00763
00764
00765 LPTSTR CRegCompilerBase::regpiece(int *flagp)
00766 {
00767 LPTSTR ret;
00768 TCHAR op;
00769 LPTSTR next;
00770 int flags;
00771
00772 ret = regatom(&flags);
00773 if (ret == NULL)
00774 return(NULL);
00775
00776 op = *regparse;
00777 if (!ISREPN(op))
00778 {
00779 *flagp = flags;
00780 return(ret);
00781 }
00782
00783 if (!(flags&HASWIDTH) && op != _T( '?' ))
00784 {
00785 regerror( REGERR_OP_COULD_BE_EMPTY );
00786 return NULL;
00787 }
00788 switch (op)
00789 {
00790 case _T( '*' ): *flagp = WORST|SPSTART; break;
00791 case _T( '+' ): *flagp = WORST|SPSTART|HASWIDTH; break;
00792 case _T( '?' ): *flagp = WORST; break;
00793 }
00794
00795 if (op == _T( '*' ) && (flags&SIMPLE))
00796 reginsert(STAR, ret);
00797 else if (op == _T( '*' ))
00798 {
00799
00800 reginsert(BRANCH, ret);
00801 regoptail(ret, regnode(BACK));
00802 regoptail(ret, ret);
00803 regtail(ret, regnode(BRANCH));
00804 regtail(ret, regnode(NOTHING));
00805 }
00806 else if (op == _T( '+' ) && (flags&SIMPLE))
00807 reginsert(PLUS, ret);
00808 else if (op == _T( '+' ))
00809 {
00810
00811 next = regnode(BRANCH);
00812 regtail(ret, next);
00813 regtail(regnode(BACK), ret);
00814 regtail(next, regnode(BRANCH));
00815 regtail(ret, regnode(NOTHING));
00816 }
00817 else if (op == _T( '?' ))
00818 {
00819
00820 reginsert(BRANCH, ret);
00821 regtail(ret, regnode(BRANCH));
00822 next = regnode(NOTHING);
00823 regtail(ret, next);
00824 regoptail(ret, next);
00825 }
00826 regparse++;
00827 if (ISREPN(*regparse))
00828 {
00829 regerror( REGERR_NESTED_OP );
00830 return NULL;
00831 }
00832
00833 return(ret);
00834 }
00835
00836
00837
00838
00839
00840
00841
00842
00843 LPTSTR CRegCompilerBase::regatom(int * flagp)
00844 {
00845 LPTSTR ret;
00846 int flags;
00847
00848 *flagp = WORST;
00849
00850 switch ( *regparse++ )
00851 {
00852
00853 case '^':
00854 ret = regnode(BOL);
00855 break;
00856 case '$':
00857 ret = regnode(EOL);
00858 break;
00859 case '.':
00860 ret = regnode(ANY);
00861 *flagp |= HASWIDTH|SIMPLE;
00862 break;
00863 case '[':
00864 {
00865 int range;
00866 int rangeend;
00867 int c;
00868
00869 if (*regparse == '^')
00870 {
00871 ret = regnode(ANYBUT);
00872 regparse++;
00873 }
00874 else
00875 ret = regnode(ANYOF);
00876 if ((c = *regparse) == ']' || c == '-')
00877 {
00878 regc(c);
00879 regparse++;
00880 }
00881 while ((c = *regparse++ ) != '\0' && c != ']')
00882 {
00883 if (c != '-')
00884 regc(c);
00885 else if ((c = *regparse) == ']' || c == '\0')
00886 regc('-');
00887 else
00888 {
00889 range = (TCHAR)*(regparse-2);
00890 rangeend = (TCHAR)c;
00891 if (range > rangeend)
00892 {
00893 regerror( REGERR_INVALID_RANGE );
00894 return NULL;
00895 }
00896 for (range++; range <= rangeend; range++)
00897 regc(range);
00898 regparse++;
00899 }
00900 }
00901 regc('\0');
00902 if (c != ']')
00903 {
00904 regerror( REGERR_UNMATCHED_BRACE );
00905 return NULL;
00906 }
00907 *flagp |= HASWIDTH|SIMPLE;
00908 break;
00909 }
00910 case '(':
00911 ret = reg(1, &flags);
00912 if (ret == NULL)
00913 return(NULL);
00914 *flagp |= flags&(HASWIDTH|SPSTART);
00915 break;
00916 case '\0':
00917 case '|':
00918 case ')':
00919
00920 regerror( REGERR_INTERNAL_UNEXPECTED_CHAR );
00921 return NULL;
00922 case '?':
00923 case '+':
00924 case '*':
00925 {
00926 regerror( REGERR_OP_FOLLOWS_NOTHING );
00927 return NULL;
00928 }
00929 case '\\':
00930 switch (*regparse++)
00931 {
00932 case '\0':
00933 {
00934 regerror( REGERR_TRAILING_ESC );
00935 return NULL;
00936 }
00937 case '<':
00938 ret = regnode(WORDA);
00939 break;
00940 case '>':
00941 ret = regnode(WORDZ);
00942 break;
00943
00944 default:
00945
00946 goto de_fault;
00947 }
00948 break;
00949 de_fault:
00950 default:
00951
00952
00953
00954
00955
00956
00957
00958
00959
00960
00961
00962
00963
00964
00965
00966
00967
00968
00969
00970
00971
00972
00973 {
00974 TCHAR *regprev;
00975 register TCHAR ch;
00976
00977 regparse--;
00978 ret = regnode(EXACTLY);
00979 for ( regprev = 0 ; ; ) {
00980 ch = *regparse++;
00981 switch (*regparse) {
00982
00983 default:
00984 regc(ch);
00985 break;
00986
00987 case '.': case '[': case '(':
00988 case ')': case '|': case '\n':
00989 case '$': case '^':
00990 case '\0':
00991
00992 magic:
00993 regc(ch);
00994 goto done;
00995
00996 case '?': case '+': case '*':
00997 if (!regprev)
00998 goto magic;
00999
01000 regparse = regprev;
01001 goto done;
01002
01003 case '\\':
01004 regc(ch);
01005 switch (regparse[1]){
01006 case '\0':
01007 case '<':
01008 case '>':
01009
01010 goto done;
01011 default:
01012
01013 regprev = regparse;
01014 regparse++;
01015 continue;
01016 }
01017 }
01018 regprev = regparse;
01019 }
01020 done:
01021 regc('\0');
01022 *flagp |= HASWIDTH;
01023 if (!regprev)
01024 *flagp |= SIMPLE;
01025 }
01026 break;
01027 }
01028
01029 return(ret);
01030 }
01031
01033
01034
01035
01036
01037 class CRegExecutor : public CRegProgramAccessor
01038 {
01039 friend bool regexp::regexec( LPCTSTR str );
01040
01041 LPTSTR reginput;
01042 LPTSTR regbol;
01043 LPTSTR * regstartp;
01044 LPTSTR * regendp;
01045
01046 regexp * prog;
01047 public:
01048 CRegExecutor( regexp * prog, LPTSTR string );
01049 protected:
01050 bool regtry( LPTSTR string );
01051 bool regmatch( LPTSTR prog );
01052 size_t regrepeat( LPTSTR node );
01053 };
01054
01055 CRegExecutor::CRegExecutor( regexp * p, LPTSTR string )
01056 : regbol( string ),
01057 regstartp( p->startp ),
01058 regendp( p->endp ),
01059 prog(p)
01060 {
01061 }
01062
01063 #ifdef _RE_DEBUG
01064 int regnarrate = 0;
01065 #endif
01066
01067
01068
01069 bool regexp::regexec( LPCTSTR str )
01070 {
01071 LPTSTR string = (LPTSTR)str;
01072
01073
01074 if ( string == NULL )
01075 {
01076 regerror( REGERR_NULLARG );
01077 return false;
01078 }
01079
01080
01081 if (*program != MAGIC)
01082 {
01083 regerror( REGERR_CORRUPTED );
01084 return false;
01085 }
01086
01087
01088 if ( regmust != NULL && _tcsstr( string, regmust ) == NULL )
01089 return false;
01090
01091 CRegExecutor executor( this, string );
01092
01093
01094 if ( reganch )
01095 return( executor.regtry( string ) );
01096
01097
01098 if ( regstart != '\0' )
01099 {
01100
01101 for ( LPTSTR s = string; s != NULL; s = _tcschr( s+1 , regstart ) )
01102 if ( executor.regtry( s) )
01103 return true;
01104 return false;
01105 }
01106 else
01107 {
01108
01109 for ( LPTSTR s = string; ! executor.regtry( s ); s++ )
01110 if (*s == '\0')
01111 return false;
01112 }
01113 return true;
01114 }
01115
01116
01117 bool CRegExecutor::regtry( LPTSTR string )
01118 {
01119 int i;
01120 LPTSTR * stp;
01121 LPTSTR * enp;
01122
01123 reginput = string;
01124
01125 stp = prog->startp;
01126 enp = prog->endp;
01127 for (i = Regexp::NSUBEXP; i > 0; i--)
01128 {
01129 *stp++ = NULL;
01130 *enp++ = NULL;
01131 }
01132 if ( regmatch( prog->program + 1 ) )
01133 {
01134 prog->startp[0] = string;
01135 prog->endp[0] = reginput;
01136 return true;
01137 }
01138 else
01139 return false;
01140 }
01141
01142
01143
01144
01145
01146
01147
01148
01149
01150
01151 bool CRegExecutor::regmatch( LPTSTR prog )
01152 {
01153 LPTSTR scan;
01154 LPTSTR next;
01155
01156 #ifdef _RE_DEBUG
01157 if (prog != NULL && regnarrate)
01158 _ftprintf(stderr, _T( "%s(\n" ), regprop(prog));
01159 #endif
01160 for (scan = prog; scan != NULL; scan = next)
01161 {
01162 #ifdef _RE_DEBUG
01163 if (regnarrate)
01164 _ftprintf(stderr, _T( "%s...\n" ), regprop(scan));
01165 #endif
01166 next = regnext(scan);
01167
01168 switch (OP(scan))
01169 {
01170 case BOL:
01171 if (reginput != regbol)
01172 return false;
01173 break;
01174 case EOL:
01175 if (*reginput != '\0')
01176 return false;
01177 break;
01178 case WORDA:
01179
01180 if ((!isalnum(*reginput)) && *reginput != '_')
01181 return(0);
01182
01183 if (reginput > regbol &&
01184 (isalnum(reginput[-1]) || reginput[-1] == '_'))
01185 return(0);
01186 break;
01187 case WORDZ:
01188
01189 if (isalnum(*reginput) || *reginput == '_')
01190 return(0);
01191
01192 break;
01193 case ANY:
01194 if (*reginput == '\0')
01195 return false;
01196 reginput++;
01197 break;
01198 case EXACTLY:
01199 {
01200 size_t len;
01201 LPTSTR const opnd = OPERAND(scan);
01202
01203
01204 if (*opnd != *reginput)
01205 return false;
01206 len = _tcslen(opnd);
01207 if (len > 1 && _tcsncmp(opnd, reginput, len) != 0)
01208 return false;
01209 reginput += len;
01210
01211 break;
01212 }
01213 case ANYOF:
01214 if (*reginput == '\0' ||
01215 _tcschr(OPERAND(scan), *reginput) == NULL)
01216 return false;
01217 reginput++;
01218 break;
01219 case ANYBUT:
01220 if (*reginput == '\0' ||
01221 _tcschr(OPERAND(scan), *reginput) != NULL)
01222 return false;
01223 reginput++;
01224 break;
01225 case NOTHING:
01226 break;
01227 case BACK:
01228 break;
01229 case OPEN+1: case OPEN+2: case OPEN+3:
01230 case OPEN+4: case OPEN+5: case OPEN+6:
01231 case OPEN+7: case OPEN+8: case OPEN+9:
01232 {
01233 const int no = OP(scan) - OPEN;
01234 LPTSTR const input = reginput;
01235
01236 if (regmatch(next))
01237 {
01238
01239
01240
01241
01242 if (regstartp[no] == NULL)
01243 regstartp[no] = input;
01244 return true;
01245 }
01246 else
01247 return false;
01248 break;
01249 }
01250 case CLOSE+1: case CLOSE+2: case CLOSE+3:
01251 case CLOSE+4: case CLOSE+5: case CLOSE+6:
01252 case CLOSE+7: case CLOSE+8: case CLOSE+9:
01253 {
01254 const int no = OP(scan) - CLOSE;
01255 LPTSTR const input = reginput;
01256
01257 if (regmatch(next))
01258 {
01259
01260
01261
01262
01263 if (regendp[no] == NULL)
01264 regendp[no] = input;
01265 return true;
01266 }
01267 else
01268 return false;
01269 break;
01270 }
01271 case BRANCH:
01272 {
01273 LPTSTR const save = reginput;
01274
01275 if (OP(next) != BRANCH)
01276 next = OPERAND(scan);
01277 else
01278 {
01279 while (OP(scan) == BRANCH)
01280 {
01281 if (regmatch(OPERAND(scan)))
01282 return true;
01283 reginput = save;
01284 scan = regnext(scan);
01285 }
01286 return false;
01287
01288 }
01289 break;
01290 }
01291 case STAR: case PLUS:
01292 {
01293 const TCHAR nextch = (OP(next) == EXACTLY) ? *OPERAND(next) : '\0';
01294 size_t no;
01295 LPTSTR const save = reginput;
01296 const size_t min = (OP(scan) == STAR) ? 0 : 1;
01297
01298 for (no = regrepeat(OPERAND(scan)) + 1; no > min; no--)
01299 {
01300 reginput = save + no - 1;
01301
01302 if (nextch == '\0' || *reginput == nextch)
01303 if (regmatch(next))
01304 return true;
01305 }
01306 return false;
01307 break;
01308 }
01309 case END:
01310 return true;
01311 break;
01312 default:
01313 regerror( REGERR_CORRUPTION );
01314 return false;
01315 break;
01316 }
01317 }
01318
01319
01320
01321
01322 regerror( REGERR_CORRUPTED_POINTERS );
01323 return false;
01324 }
01325
01326
01327
01328 size_t CRegExecutor::regrepeat( LPTSTR node )
01329 {
01330 size_t count;
01331 LPTSTR scan;
01332 TCHAR ch;
01333
01334 switch (OP(node))
01335 {
01336 case ANY:
01337 return(_tcslen(reginput));
01338 break;
01339 case EXACTLY:
01340 ch = *OPERAND(node);
01341 count = 0;
01342 for (scan = reginput; *scan == ch; scan++)
01343 count++;
01344 return(count);
01345 break;
01346 case ANYOF:
01347 return(_tcsspn(reginput, OPERAND(node)));
01348 break;
01349 case ANYBUT:
01350 return(_tcscspn(reginput, OPERAND(node)));
01351 break;
01352 default:
01353 regerror( REGERR_BAD_REGREPEAT );
01354 return(0);
01355 break;
01356 }
01357
01358 }
01359
01360 #ifdef _RE_DEBUG
01361
01362
01363
01364 void regexp::regdump()
01365 {
01366 LPTSTR s;
01367 TCHAR op = EXACTLY;
01368 LPTSTR next;
01369
01370 s = _tcsinc(program);
01371 while (op != END)
01372 {
01373 op = OP(s);
01374 _tprintf(_T( "%2d%s" ), s-program, regprop(s));
01375 next = regnext(s);
01376 if (next == NULL)
01377 _tprintf(_T( "(0)" ));
01378 else
01379 _tprintf(_T( "(%d)" ), (s-program)+(next-s));
01380 s += 3;
01381 if (op == ANYOF || op == ANYBUT || op == EXACTLY)
01382 {
01383
01384 while (*s != '\0')
01385 {
01386 _puttchar(*s);
01387 s = _tcsinc(s);
01388 }
01389 s = _tcsinc(s);
01390 }
01391 _puttchar('\n');
01392 }
01393
01394
01395 if (regstart != '\0')
01396 _tprintf(_T( "start `%c' " ), regstart);
01397 if (reganch)
01398 _tprintf(_T( "anchored " ));
01399 if (regmust != NULL)
01400 _tprintf(_T( "must have \"%s\"" ), regmust);
01401 _tprintf(_T( "\n" ));
01402 }
01403
01404
01405
01406 #define OUTPUT(s) case s: p = _T( #s ); break
01407 LPTSTR CRegProgramAccessor::regprop( LPTSTR op )
01408 {
01409 LPTSTR p;
01410 static TCHAR buf[50];
01411
01412 (void) _tcscpy(buf, _T( ":" ));
01413
01414 switch (OP(op))
01415 {
01416 OUTPUT( BOL );
01417 OUTPUT( EOL );
01418 OUTPUT( ANY );
01419 OUTPUT( ANYOF );
01420 OUTPUT( ANYBUT );
01421 OUTPUT( BRANCH );
01422 OUTPUT( EXACTLY );
01423 OUTPUT( NOTHING );
01424 OUTPUT( BACK );
01425 OUTPUT( END );
01426 OUTPUT( STAR );
01427 OUTPUT( PLUS );
01428 OUTPUT( WORDA );
01429 OUTPUT( WORDZ );
01430 case OPEN+1: case OPEN+2: case OPEN+3:
01431 case OPEN+4: case OPEN+5: case OPEN+6:
01432 case OPEN+7: case OPEN+8: case OPEN+9:
01433 _stprintf(buf+_tcslen(buf), _T( "OPEN%d" ), OP(op)-OPEN);
01434 p = NULL;
01435 break;
01436 case CLOSE+1: case CLOSE+2: case CLOSE+3:
01437 case CLOSE+4: case CLOSE+5: case CLOSE+6:
01438 case CLOSE+7: case CLOSE+8: case CLOSE+9:
01439 _stprintf(buf+_tcslen(buf), _T( "CLOSE%d" ), OP(op)-CLOSE);
01440 p = NULL;
01441 break;
01442 default:
01443 regerror( REGERR_CORRUPTED_OPCODE );
01444 break;
01445 }
01446 if (p != NULL)
01447 (void) _tcscat(buf, p);
01448 return(buf);
01449 }
01450 #endif
01451
01453
01454 Regexp::Regexp()
01455 : rc(0),
01456 string(0)
01457 {
01458 }
01459
01460 Regexp::Regexp( LPCTSTR exp, BOOL iCase )
01461 : rc( new regexp( exp, iCase ) ),
01462 string( 0 )
01463 {
01464 }
01465
01466 Regexp::Regexp( const Regexp &r )
01467 : rc( r.rc ),
01468 m_szError(r.m_szError),
01469 string(r.string)
01470 {
01471 if ( rc )
01472 rc->count++;
01473 }
01474
01475 const Regexp & Regexp::operator=( const Regexp & r )
01476 {
01477 if ( this != &r )
01478 {
01479 if ( rc && rc->count-- == 0 )
01480 delete rc;
01481
01482 rc = r.rc;
01483 if ( rc )
01484 rc->count++;
01485
01486 string = r.string;
01487 m_szError = r.m_szError;
01488 }
01489 return *this;
01490 }
01491
01492 Regexp::~Regexp()
01493 {
01494 if ( rc && rc->count-- == 0 )
01495 delete rc;
01496 }
01497
01498 bool Regexp::Match( const TCHAR * s )
01499 {
01500 ClearErrorString();
01501 string = s;
01502 bool ret = false;
01503 if ( rc )
01504 {
01505
01506
01507 if ( rc->count )
01508 {
01509 rc->count--;
01510 rc = rc->getCopy();
01511 }
01512
01513 ret = rc->regexec( s );
01514 int i = 0;
01515 if ( ret )
01516 for ( i = 0; i < Regexp::NSUBEXP && rc->startp[i] ; i++ )
01517 ;
01518 rc->numSubs = i - 1;
01519 }
01520 else
01521 m_szError = CRegErrorHandler::FindErr( REGERR_NO_REGEXP );
01522 return ret;
01523 }
01524
01525 CString Regexp::GetReplaceString( LPCTSTR source ) const
01526 {
01527 ClearErrorString();
01528 if ( rc )
01529 return rc->GetReplaceString( source );
01530 else
01531 m_szError = CRegErrorHandler::FindErr( REGERR_NO_REGEXP );
01532 return _T( "" );
01533 }
01534
01535 int Regexp::SubStrings() const
01536 {
01537 ClearErrorString();
01538 int ret = -1;
01539 if ( rc )
01540 ret = rc->numSubs;
01541 else
01542 m_szError = CRegErrorHandler::FindErr( REGERR_NO_REGEXP );
01543 return ret;
01544 }
01545
01546 int Regexp::SubStart( unsigned int i ) const
01547 {
01548 ClearErrorString();
01549 int ret = -1;
01550 if ( rc )
01551 ret = rc->startp[safeIndex(i)] - string;
01552 else
01553 m_szError = CRegErrorHandler::FindErr( REGERR_NO_REGEXP );
01554 return ret;
01555 }
01556
01557 int Regexp::SubLength( unsigned int i ) const
01558 {
01559 ClearErrorString();
01560 int ret = -1;
01561 if ( rc )
01562 {
01563 i = safeIndex(i);
01564 ret = rc->endp[i] - rc->startp[i];
01565 }
01566 else
01567 m_szError = CRegErrorHandler::FindErr( REGERR_NO_REGEXP );
01568 return ret;
01569 }
01570
01571 bool Regexp::CompiledOK() const
01572 {
01573 return rc ? rc->Status() : false;
01574 }
01575
01576 #ifdef _RE_DEBUG
01577 void Regexp::Dump()
01578 {
01579 if ( rc )
01580 rc->regdump();
01581 #if defined( _DEBUG )
01582 else
01583 TRACE0( "No regexp to dump out\n" );
01584 #endif
01585 }
01586 #endif
01587
01588 int Regexp::safeIndex( unsigned int i ) const
01589 {
01590 return i < Regexp::NSUBEXP ? i : Regexp::NSUBEXP;
01591 }
01592
01593 const CString Regexp::operator[]( unsigned int i ) const
01594 {
01595 ClearErrorString();
01596 ASSERT( rc );
01597 if ( rc )
01598 {
01599 CString buffer;
01600 int len = SubLength(i);
01601 TCHAR * szbuf = buffer.GetBufferSetLength( len );
01602 memcpy( szbuf, rc->startp[i], len * sizeof(TCHAR) );
01603 buffer.ReleaseBuffer();
01604 return buffer;
01605 }
01606 else
01607 {
01608 m_szError = CRegErrorHandler::FindErr( REGERR_NO_REGEXP );
01609 return "";
01610 }
01611 }
01612
01613 void regexp::ignoreCase( const TCHAR * in, TCHAR * out )
01614 {
01615
01616 BOOL inRange = FALSE;
01617 while( *in )
01618 {
01619 if ( *in == '[' )
01620 inRange = TRUE;
01621 if ( *in == ']' )
01622 inRange = FALSE;
01623 if ( ! inRange && isalpha( *in ) )
01624 {
01625 *out++ = '[';
01626 *out++ = (TCHAR)toupper( *in );
01627 *out++ = (TCHAR)tolower( *in );
01628 *out++ = ']';
01629 }
01630 else
01631 *out++ = *in;
01632 in++;
01633 }
01634 *out = 0;
01635 }
01636
01637
01638
01639
01640 CString regexp::GetReplaceString( const TCHAR* sReplaceExp ) const
01641 {
01642 CString szEmpty( _T( "" ) );
01643
01644 TCHAR *src = (TCHAR *)sReplaceExp;
01645 TCHAR *buf;
01646 TCHAR c;
01647 int no;
01648 size_t len;
01649
01650 if( sReplaceExp == NULL )
01651 {
01652 regerror( REGERR_NULL_TO_REGSUB );
01653 return szEmpty;
01654 }
01655 if ( *program != MAGIC)
01656 {
01657 regerror( REGERR_DAMAGED_REGEXP_REGSUB );
01658 return szEmpty;
01659 }
01660
01661
01662 int replacelen = 0;
01663 while ((c = *src++) != _T('\0'))
01664 {
01665 if (c == _T('&'))
01666 no = 0;
01667 else if (c == _T('\\') && isdigit(*src))
01668 no = *src++ - _T('0');
01669 else
01670 no = -1;
01671
01672 if (no < 0)
01673 {
01674
01675 if (c == _T('\\') && (*src == _T('\\') || *src == _T('&')))
01676 c = *src++;
01677 replacelen++;
01678 }
01679 else if (startp[no] != NULL && endp[no] != NULL &&
01680 endp[no] > startp[no])
01681 {
01682
01683 len = endp[no] - startp[no];
01684 replacelen += len;
01685 }
01686 }
01687
01688 CString szReplace;
01689
01690 buf = szReplace.GetBufferSetLength( replacelen );
01691
01692
01693 src = (TCHAR *)sReplaceExp;
01694 while ((c = *src++) != _T('\0'))
01695 {
01696 if (c == _T('&'))
01697 no = 0;
01698 else if (c == _T('\\') && isdigit(*src))
01699 no = *src++ - _T('0');
01700 else
01701 no = -1;
01702
01703 if (no < 0)
01704 {
01705
01706 if (c == _T('\\') && (*src == _T('\\') || *src == _T('&')))
01707 c = *src++;
01708 *buf++ = c;
01709 }
01710 else if (startp[no] != NULL && endp[no] != NULL &&
01711 endp[no] > startp[no])
01712 {
01713
01714 len = endp[no] - startp[no];
01715 _tcsncpy(buf, startp[no], len);
01716 buf += len;
01717 if (len != 0 && *(buf-1) == _T( '\0' ))
01718 {
01719 regerror( REGERR_DAMAGED_MATCH_STRING );
01720 return szEmpty;
01721 }
01722 }
01723 }
01724
01725 szReplace.ReleaseBuffer( replacelen );
01726 return szReplace;
01727 }
01728
01729 CString Regexp::GetErrorString() const
01730 {
01731
01732 ASSERT( ( ! CompiledOK() ) ? ( rc ? rc->GetErrorString() : m_szError).GetLength() != 0 : 1 );
01733 return rc ? rc->GetErrorString() : m_szError ;
01734 }
01735
01736 void Regexp::ClearErrorString() const
01737 {
01738 if ( rc )
01739 rc->ClearErrorString();
01740 m_szError.Empty();
01741 }
01742