#ifndef lint static char sccsid[] = "@(#)regexp.c 1.2 (LBL) 12/4/85"; static char rcsid[] = "$Id: regexp.c,v 1.3 1999/05/27 16:17:43 mike Exp $"; #endif /* * Copyright %%\copyright%% 1980 The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Regular expression matching routines for lgrind/tgrind/tfontedpr. * * These routines were written by Dave Presotto (I think) for vgrind. * Minor mods & attempts to improve performance by Van Jacobson * (van@@lbl-rtsg) and Chris Torek (chris@@maryland). * * Modifications. * -------------- * Sep 91 George V Reilly Fixed up some bogus uses of @NIL@ and @NULL@. * 30 Mar 85 Van & Chris Changed @expmatch()@ to return pointer to * start of what was matched in addition to * pointer to match end. Several changes to * improve performance (too numerous to mention). * 11 Dec 84 Dave Presotto Written. */ #include #include #include #include #include "regexp.h" #define makelower(c) (isupper((c)) ? tolower((c)) : (c)) extern char *l_id; static void expconv(void); /* forward declaration */ int (*re_strncmp)(const char *, const char *, size_t); /* function used by @expmatch()@ to compare * strings. The caller should make it point to * @strncmp()@ if case is significant & * @lc_strncmp()@ otherwise. */ /* @lc_strncmp()@ --- like @strncmp()@ except that we convert the * first string to lower case before comparing. */ int lc_strncmp(register const char *s1, register const char *s2, register size_t len) { while (len-- > 0) if (*s2 - makelower(*s1)) return 1; else s2++, s1++; return 0; } /* The following routine converts an irregular expression to * internal format. * * Either meta symbols (%|\a|%, %|\d|%, or %|\p|%) or character strings * or operations (alternation or parenthesizing) can be specified. * Each starts with a descriptor byte. The descriptor byte * has @STR@ set for strings, @META@ set for meta symbols, * and @OPER@ set for operations. The descriptor byte can also have * the @OPT@ bit set if the object defined is optional. Also @ALT@ * can be set to indicate an alternation. * * For metasymbols, the byte following the descriptor byte identities * the meta symbol (containing an ASCII @'a'@, @'d'@, @'p'@, @'|'@, * or @'('@). For strings, the byte after the descriptor is a * character count for the string: *@ * meta symbols := descriptor * symbol * * strings := descriptor * character count * the string * * operations := descriptor * symbol * character count@ */ /* * handy macros for accessing parts of match blocks */ #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */ #define MNEXT(A) (A+2) /* character following a metasymbol block */ #define OSYM(A) (*(A+1)) /* symbol in an operation block */ #define OCNT(A) (*(A+2)) /* character count */ #define ONEXT(A) (A+3) /* next character after the operation */ #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */ #define SCNT(A) (*(A+1)) /* byte count of a string */ #define SSTR(A) (A+2) /* address of the string */ #define SNEXT(A) (A+2+*(A+1)) /* character following the string */ /* * bit flags in the descriptor */ #define OPT 1 #define STR 2 #define META 4 #define ALT 8 #define OPER 16 char *ure; /* pointer current position in unconverted exp */ char *ccre; /* pointer to current position in converted exp */ /* * Convert an irregular expression to the internal form */ char *convexp(char *re) /* re unconverted irregular expression */ { register char *cre; /* pointer to converted regular expression */ /* allocate room for the converted expression */ if (re == NULL) return NULL; if (*re == '\0') return NULL; cre = (char*)malloc(4 * strlen(re) + 3); ccre = cre; ure = re; /* start the conversion with a %|\a|% */ *cre = META | OPT; MSYM(cre) = 'a'; ccre = MNEXT(cre); /* start the conversion (it's recursive) */ expconv(); *ccre = 0; return cre; } /* * Actually do the conversion */ static void expconv(void) { register char *cs; /* pointer to current symbol in converted exp */ register char c; /* character being processed */ register char *acs; /* pointer to last alternate */ register int temp; /* let the conversion begin */ acs = NULL; cs = NULL; while (*ure != '\0') { switch (c = *ure++) { case '\\': switch (c = *ure++) { /* escaped characters are just characters */ default: if (cs == NULL || (*cs & STR) == 0) { cs = ccre; *cs = STR; SCNT(cs) = 1; ccre += 2; } else SCNT(cs)++; *ccre++ = c; break; /* normal(?) metacharacters */ case 'a': case 'd': case 'e': case 'p': if (acs != NULL && acs != cs) { do { temp = OCNT(acs); OCNT(acs) = ccre - acs; acs -= temp; } while (temp != 0); acs = NULL; } cs = ccre; *cs = META; MSYM(cs) = c; ccre = MNEXT(cs); break; } break; /* just put the symbol in */ case '^': case '$': if (acs != NULL && acs != cs) { do { temp = OCNT(acs); OCNT(acs) = ccre - acs; acs -= temp; } while (temp != 0); acs = NULL; } cs = ccre; *cs = META; MSYM(cs) = c; ccre = MNEXT(cs); break; /* mark the last match sequence as optional */ case '?': if (cs) *cs = *cs | OPT; break; /* recurse and define a subexpression */ case '(': if (acs != NULL && acs != cs) { do { temp = OCNT(acs); OCNT(acs) = ccre - acs; acs -= temp; } while (temp != 0); acs = NULL; } cs = ccre; *cs = OPER; OSYM(cs) = '('; ccre = ONEXT(cs); expconv(); OCNT(cs) = ccre - cs; /* offset to next symbol */ break; /* return from a recursion */ case ')': if (acs != NULL) { do { temp = OCNT(acs); OCNT(acs) = ccre - acs; acs -= temp; } while (temp != 0); acs = NULL; } cs = ccre; *cs = META; MSYM(cs) = c; ccre = MNEXT(cs); return; /* Mark the last match sequence as having an alternate. */ /* The third byte will contain an offset to jump over the */ /* alternate match in case the first did not fail */ case '|': if (acs != NULL && acs != cs) OCNT(ccre) = ccre - acs; /* make a back pointer */ else OCNT(ccre) = 0; *cs |= ALT; cs = ccre; *cs = OPER; OSYM(cs) = '|'; ccre = ONEXT(cs); acs = cs; /* remember that the pointer is to be filled */ break; /* if it's not a metasymbol, just build a character string */ default: if (cs == NULL || (*cs & STR) == 0) { cs = ccre; *cs = STR; SCNT(cs) = 1; ccre = SSTR(cs); } else SCNT(cs)++; *ccre++ = c; break; } } if (acs != NULL) { do { temp = OCNT(acs); OCNT(acs) = ccre - acs; acs -= temp; } while (temp != 0); acs = NULL; } return; } /* end of @expconv()@ */ /* * The following routine recognises an irregular expression * with the following special characters: * * %|\?|% means last match was optional * %|\a|% matches any number of characters * %|\d|% matches any number of spaces and tabs * %|\p|% matches any number of alphanumeric characters. * The characters matched will be copied into * the area pointed to by @mstring@. * %|\||% alternation * %|\( \)|% grouping used mostly for alternation and * optionality * * The irregular expression must be translated to internal form * prior to calling this routine * * The value returned is the pointer to the last character matched. * If @strtptr@ is non-@NULL@, a pointer to the start of the string matched * (excluding %|\a|% matches) will be returned at @*strtptr@. * If @mstring@ is non-@NULL@, the string matched by a %|\p|% will be copied * into @mstring@. */ boolean _escaped; /* true if we are currently @_escaped@ */ char *_sstart; /* start of string. Set in @putScp()@. */ char *expmatch( register char *s, /* string to check for a match in */ register char *re, /* a converted irregular expression */ char **strtptr, /* where to put ptr to start of match */ char *mstring) /* where to put whatever matches a %|\p|% */ { register char *cs; /* the current symbol */ register char *ptr, *s1; /* temporary pointer */ register char c; /* temporary */ boolean matched; /* a temporary @boolean@ */ /* initial conditions */ if (strtptr) *strtptr = '\0'; if (re == NULL) return NULL; cs = re; matched = FALSE; /* loop till expression string is exhausted (or at least pretty tired) */ while (*cs) { switch (*cs & (OPER | STR | META)) { /* try to match a string */ case STR: matched = !((*re_strncmp)(s, SSTR(cs), SCNT(cs))); if (matched) { /* hoorah: it matches */ s += SCNT(cs); cs = SNEXT(cs); } else if (*cs & ALT) { /* alternation: skip to next expression */ cs = SNEXT(cs); } else if (*cs & OPT) { /* the match is optional */ cs = SNEXT(cs); matched = 1; /* indicate a successful match */ } else { /* no match: error return */ return NULL; } break; /* an operator: do something fancy */ case OPER: switch (OSYM(cs)) { /* this is an alternation */ case '|': if (matched) /* last thing in the alternation was a match: skip ahead */ cs = OPTR(cs); else /* no match: keep trying */ cs = ONEXT(cs); break; /* this is a grouping: recurse */ case '(': ptr = expmatch(s, ONEXT(cs), strtptr, mstring); if (ptr != NULL) { /* the subexpression matched */ matched = 1; s = ptr; } else if (*cs & ALT) { /* alternation: skip to next expression */ matched = 0; } else if (*cs & OPT) { /* the match is optional */ matched = 1; /* indicate a successful match */ } else { /* no match: error return */ return NULL; } cs = OPTR(cs); break; } break; /* try to match a metasymbol */ case META: switch (MSYM(cs)) { /* try to match anything and remember what was matched */ case 'p': /* * This is really the same as trying the match the * remaining parts of the expression to any subset * of the string. */ s1 = s; do { ptr = expmatch(s1, MNEXT(cs), strtptr, mstring); if (ptr != NULL && s1 != s) { /* we have a match: remember the match */ if (mstring) { strncpy(mstring, s, (size_t)(s1 - s)); mstring[(int)(s1 - s)] = '\0'; } return ptr; } else if (ptr != NULL && (*cs & OPT)) { /* %|\p|% was optional, so no match is ok */ return ptr; } else if (ptr != NULL) { /* not optional and we still matched */ return NULL; } if (!isalnum(*s1) && !strchr(l_id, *s1)) return NULL; if (*s1 == '\\') _escaped = _escaped ? FALSE : TRUE; else _escaped = FALSE; } while (*s1++); return NULL; /* try to match anything */ case 'a': /* * This is really the same as trying the match the * remaining parts of the expression to any subset * of the string. */ s1 = s; do { /* * Hack for an important special case: if the next thing * in the pattern is a string, just gobble characters until * we find something that matches that string (this saves * the cost of a recursive call on @expmatch()@ while scanning * for the start of comments or strings). Since many * patterns end with a string, we also treat that as a * special case. */ if(*(ptr = MNEXT(cs)) == STR) { c = *SSTR(ptr); while(*s1 && *s1 != c) s1++; if (*s1 == 0) return NULL; if (SNEXT(ptr) == 0 && (s1 != s || *cs & OPT)) { /* next item is a string, it's the last item and * the %|\a|% match is ok --- just loop to try & match * the string. */ if (strtptr) *strtptr = s1; cs = ptr; s = s1; break; } } ptr = expmatch(s1, MNEXT(cs), strtptr, mstring); if (ptr != NULL && (s1 != s || *cs & OPT)) { /* we have a match */ if (strtptr) *strtptr = s1; return ptr; } else if (ptr != NULL) { /* not optional and we still matched */ return NULL; } if (*s1 == '\\') _escaped = _escaped ? FALSE : TRUE; else _escaped = FALSE; } while (*s1++); return NULL; /* fail if we are currently @_escaped@ */ case 'e': if (_escaped) return NULL; cs = MNEXT(cs); break; /* match any number of tabs and spaces */ case 'd': ptr = s; while (*s == ' ' || *s == '\t') s++; if (s != ptr || s == _sstart) { /* match: be happy */ matched = 1; cs = MNEXT(cs); } else if (*s == '\n' || *s == '\0') { /* match: be happy */ matched = 1; cs = MNEXT(cs); } else if (*cs & ALT) { /* try the next part */ matched = 0; cs = MNEXT(cs); } else if (*cs & OPT) { /* doesn't matter */ matched = 1; cs = MNEXT(cs); } else /* no match: error return */ return NULL; break; /* check for end of line */ case '$': if (*s == '\0' || *s == '\n') { /* match: be happy */ /* s++; was here * removed; now $ is left in stream to be matched again, which * is not bad (EOL can end several things at once. Also it * leaves things like CE on the same line. */ matched = 1; cs = MNEXT(cs); } else if (*cs & ALT) { /* try the next part */ matched = 0; cs = MNEXT(cs); } else if (*cs & OPT) { /* doesn't matter */ matched = 1; cs = MNEXT(cs); } else /* no match: error return */ return NULL; break; /* check for start of line */ case '^': if (s == _sstart) { /* match: be happy */ matched = 1; cs = MNEXT(cs); } else if (*cs & ALT) { /* try the next part */ matched = 0; cs = MNEXT(cs); } else if (*cs & OPT) { /* doesn't matter */ matched = 1; cs = MNEXT(cs); } else /* no match: error return */ return NULL; break; /* end of a subexpression: return success */ case ')': return s; } break; } } return s; }