% \documentstyle[noweb]{article} % % \setlength{\oddsidemargin}{0in} % \setlength{\evensidemargin}{0in} % \setlength{\topmargin}{0in} % \addtolength{\topmargin}{-\headheight} % \addtolength{\topmargin}{-\headsep} % \setlength{\textheight}{8.9in} % \setlength{\textwidth}{6.5in} % \setlength{\marginparwidth}{0.5in} \subsection{An Efficient String Matcher (by Preston Briggs)} \label{preston} \subsubsection{Introduction} The obvious approach to this problem would be quite expensive for large documents; however, there is an interesting paper describing an efficient solution~\cite{aho:efficient}. \paragraph{Boilerplate} \indent\null\par <<*>>= static char rcsid[] = "$Id: recognize.nw,v 1.24 2008/10/06 01:03:05 nr Exp nr $"; static char rcsname[] = "$Name: v2_12 $"; static struct keepalive { char *s; struct keepalive *p; } keepalive[] = { {rcsid, keepalive}, {rcsname, keepalive} }; /* avoid warnings */ <> <

> <> <> <> @ <

>= <> <> @ <>= #include #include @ \paragraph{External Interface} The externally visible interface was designed by Norman Ramsey. We assume that [[alphanum]] and [[symbols]] point to constant strings; {\sl i.e.,} we don't bother to copy them into separately allocated space. <>= Recognizer new_recognizer(char *alphanum, char *symbols); <>= typedef struct recognizer *Recognizer; @ A copy is made of the string pointed to by [[id]]. It won't hurt to add the same identifier multiple times to a given recognizer. <>= void add_ident(Recognizer r, char *id); void stop_adding(Recognizer r); <>= void search_for_ident(Recognizer r, char *input, Callback f, void *closure); @ [[instance]] is a pointer to the place within [[input]] that we saw the identifier. <>= typedef void (*Callback) (void *closure, char *id, char *instance); @ \subsubsection{Defining the Automata} <>= typedef struct goto_node Goto_Node; typedef struct move_node Move_Node; <>= typedef struct name_node { struct name_node *next; /* points to the next name on the output list */ char *name; } Name_Node; <>= struct move_node { Move_Node *next; /* points to the next node on the move list */ Goto_Node *state; /* the next state for this character */ unsigned char c; }; <>= struct goto_node { Name_Node *output; /* list of words ending in this state */ Move_Node *moves; /* list of possible moves */ Goto_Node *fail; /* and where to go when no move fits */ Goto_Node *next; /* next goto node with same depth */ }; <>= struct recognizer { Goto_Node *root[256]; /* might want 128, depending on the character set */ char *alphas; char *syms; int max_depth; Goto_Node **depths; /* an array, max_depth long, of lists of goto_nodes, created while adding ids, used while building the failure functions */ }; @ \paragraph{A Utility Function} We need a function that, given the current state and a character, will return the next state as directed by the ``goto table.'' If there is no defined entry in the table, the function returns [[NULL]]. <>= static Goto_Node *goto_lookup(unsigned char c, Goto_Node *g) { Move_Node *m = g->moves; while (m && m->c != c) m = m->next; return m ? m->state : NULL; } @ \subsubsection{Building the Automata} The [[max_depth]] should be initialized to be at least 2. <>= Recognizer new_recognizer(char *alphanum, char *symbols) { Recognizer r = (Recognizer) calloc(1, sizeof(struct recognizer)); r->alphas = alphanum; r->syms = symbols; r->max_depth = 10; r->depths = (Goto_Node **) calloc(r->max_depth, sizeof(Goto_Node *)); return r; } @ \paragraph{Building the Goto Table} We assume [[id]] is at least 1 character long. <>= void add_ident(Recognizer r, char *id) { int depth = 2; char *p = id; unsigned char c = *p++; Goto_Node *q = r->root[c]; if (!q) <> c = *p++; while (c) { Goto_Node *new = goto_lookup(c, q); if (!new) <> q = new; depth++; c = *p++; } <output]] to [[id]] (if not already present)>> } <>= { q = (Goto_Node *) calloc(1, sizeof(Goto_Node)); r->root[c] = q; q->next = r->depths[1]; r->depths[1] = q; } <>= { Move_Node *new_move = (Move_Node *) malloc(sizeof(Move_Node)); new = (Goto_Node *) calloc(1, sizeof(Goto_Node)); new_move->state = new; new_move->c = c; new_move->next = q->moves; q->moves = new_move; if (depth == r->max_depth) <> new->next = r->depths[depth]; r->depths[depth] = new; } <>= { int i; Goto_Node **new_depths = (Goto_Node **) calloc(2*depth, sizeof(Goto_Node *)); r->max_depth = 2 * depth; for (i=0; idepths[i]; free(r->depths); r->depths = new_depths; } <output]] to [[id]] (if not already present)>>= if (!q->output) { char *copy = malloc(strlen(id) + 1); strcpy(copy, id); q->output = (Name_Node *) malloc(sizeof(Name_Node)); q->output->next = NULL; q->output->name = copy; } @ %\newpage \paragraph{Building the Failure Functions} After all the strings have been added to the goto table, we can construct the failure functions. It's going to be hard to explain this one. <>= void stop_adding(Recognizer r) { int depth; for (depth=1; depthmax_depth; depth++) { Goto_Node *g = r->depths[depth]; while (g) { Move_Node *m = g->moves; while (m) { unsigned char a = m->c; Goto_Node *s = m->state; Goto_Node *state = g->fail; while (state && !goto_lookup(a, state)) state = state->fail; if (state) s->fail = goto_lookup(a, state); else s->fail = r->root[a]; if (s->fail) { Name_Node *p = s->fail->output; while (p) { Name_Node *q = (Name_Node *) malloc(sizeof(Name_Node)); q->name = p->name; /* depending on memory deallocation strategy, we may need to copy this */ q->next = s->output; s->output = q; p = p->next; } } m = m->next; } g = g->next; } } } @ \subsubsection{Using the Automata} <>= void search_for_ident(Recognizer r, char *input, Callback f, void *closure) { Goto_Node *state = NULL; char *current = input; unsigned char c = (unsigned char) *current++; while (c) { <> <> c = *current++; } } @ This is all complicated by my use of [[NULL]] to indicate the initial state. However, we get a nice speedup by using the [[root]] array instead of walking down the move list for every character. <>= { while (state && !goto_lookup(c, state)) state = state->fail; state = state ? goto_lookup(c, state) : r->root[c]; } @ We walk down the output list, calling [[f]] with each name that is not rejected (see the next section). <>= { if (state) { Name_Node *p = state->output; while (p) { if (!reject_match(r, p->name, input, current)) f(closure, p->name, current - strlen(p->name)); p = p->next; } } } @ % \newpage \paragraph{Rejecting Matches} A problem with simple substring matching is that the string ``he'' would match longer strings like ``she'' and ``her.'' Norman Ramsey suggested examining the characters occuring immediately before and after a match and rejecting the match if it appears to be part of a longer token. Of course, the concept of {\sl token\/} is language-dependent, so we may be occasionally mistaken. For the present, we'll consider the mechanism an experiment. <>= int reject_match(Recognizer r, char *id, char *input, char *current) { int len = strlen(id); char first = id[0]; char last = id[len - 1]; char next = *current; char prev = '\0'; current = current - len - 1; if (input <= current) prev = *current; if (prev && strchr(r->alphas, first) && strchr(r->alphas, prev)) return 1; if (next && strchr(r->alphas, last ) && strchr(r->alphas, next)) return 1; if (prev && strchr(r->syms, first) && strchr(r->syms, prev)) return 1; if (next && strchr(r->syms, last ) && strchr(r->syms, next)) return 1; return 0; } @ Note we never reject a zero [[prev]] or [[next]], since some implementations of [[strchr]] always return true when searching for a zero character. @ We need a prototype for [[reject_match]], since it's referenced before its definition. <>= int reject_match(Recognizer r, char *id, char *input, char *current);