home *** CD-ROM | disk | FTP | other *** search
- /* Copyright (c) 1994 Sun Wu, Udi Manber, Burra Gopal. All Rights Reserved. */
- /* multipattern matcher */
- #include <stdio.h>
- #include <ctype.h>
- #include "agrep.h"
-
- #undef MAXPAT
- #define MAXPAT 256
-
- #define MAXLINE 1024
- #define MAXSYM 256
- #define MAXMEMBER1 4096
- #define MAXPATFILE 260000
- #define BLOCKSIZE 16384
- #define MAXHASH 8192
- #define mm 8191
- #define max_num 30000
- #define W_DELIM 128
- #define L_DELIM 10
-
- extern int LIMITOUTPUT;
- extern int MULTI_OUTPUT;
- extern COUNT, FNAME, SILENT, FILENAMEONLY, prev_num_of_matched, num_of_matched;
- extern INVERSE;
- extern WORDBOUND, WHOLELINE, NOUPPER;
- extern unsigned char CurrentFileName[], Progname[];
- extern total_line;
- extern agrep_initialfd;
- extern int EXITONERROR;
- extern int PRINTPATTERN;
- extern int agrep_inlen;
- extern CHAR *agrep_inbuffer;
- extern FILE *agrep_finalfp;
- extern int agrep_outpointer;
- extern int agrep_outlen;
- extern CHAR * agrep_outbuffer;
- extern int errno;
-
- extern int NEW_FILE, POST_FILTER;
-
- int LONG = 0;
- int SHORT = 0;
- int p_size=0;
- unsigned char SHIFT1[MAXMEMBER1];
- unsigned char tr[MAXSYM];
- unsigned char tr1[MAXSYM];
- struct pat_list {
- int index;
- struct pat_list *next;
- }
- *HASH[MAXHASH];
- struct pat_list *pt, *qt;
- unsigned char pat_spool[MAXPATFILE+2*max_num+MAXPAT];
- unsigned char *patt[max_num];
- unsigned char pat_len[max_num];
- int num_pat;
-
-
- prepf(mfp, mbuf, mlen)
- int mfp, mlen;
- unsigned char *mbuf;
- {
- int length=0, i, p=1, pdx=0;
- unsigned char *pat_ptr=pat_spool;
- unsigned Mask = 15;
- int num_read;
- unsigned char *buf;
-
- if ((mfp == -1) && ((mbuf == NULL) || (mlen <= 0))) return -1;
-
- if (mfp != -1) {
- alloc_buf(mfp, &buf, MAXPATFILE+2*BLOCKSIZE);
- while((num_read = fill_buf(mfp, buf+length, 2*BLOCKSIZE)) > 0) {
- length = length + num_read;
- if(length > MAXPATFILE) {
- fprintf(stderr, "%s: maximum pattern file size is %d\n", Progname, MAXPATFILE);
- if (!EXITONERROR) {
- errno = 2;
- free_buf(mfp, buf);
- return -1;
- }
- else exit(2);
- }
- }
- }
- else {
- buf = mbuf;
- length = mlen;
- }
-
- buf[length] = '\n';
-
- i = 0;
- p=1;
- while(i<length) {
- patt[p] = pat_ptr;
- if(WORDBOUND) *pat_ptr++ = W_DELIM;
- if(WHOLELINE) *pat_ptr++ = L_DELIM;
- while((*pat_ptr = buf[i++]) != '\n') pat_ptr++;
- if(WORDBOUND) *pat_ptr++ = W_DELIM;
- if(WHOLELINE) *pat_ptr++ = L_DELIM; /* Can't be both on */
- *pat_ptr++ = 0;
- p++;
- }
- if(p>max_num) {
- fprintf(stderr, "%s: maximum number of patterns is %d\n", Progname, max_num);
- if (!EXITONERROR) {
- errno = 2;
- free_buf(mfp, buf);
- return -1;
- }
- else exit(2);
- }
- for(i=1; i<20; i++) *pat_ptr = i; /* boundary safety zone */
- for(i=0; i< MAXSYM; i++) tr[i] = i;
- if(NOUPPER) {
- for (i=0; i<MAXSYM; i++)
- if (isupper(i)) tr[i] = tr[tolower(i)];
- /* for(i='A'; i<= 'Z'; i++) tr[i] = i + 'a' - 'A'; */
- }
- if(WORDBOUND) {
- for(i=0; i<128; i++) if(!isalnum(i)) tr[i] = W_DELIM;
- }
- for(i=0; i< MAXSYM; i++) tr1[i] = tr[i]&Mask;
- num_pat = p-1;
- p_size = MAXPAT;
- for(i=1 ; i <= num_pat; i++) {
- p = strlen(patt[i]);
- pat_len[i] = p;
- if(p!=0 && p < p_size) p_size = p;
- }
- if(p_size == 0) {
- fprintf(stderr, "%s: the pattern file is empty\n");
- if (!EXITONERROR) {
- errno = 2;
- free_buf(mfp, buf);
- return -1;
- }
- else exit(2);
- }
- if(length > 400 && p_size > 2) LONG = 1;
- if(p_size == 1) SHORT = 1;
- for(i=0; i<MAXMEMBER1; i++) SHIFT1[i] = p_size - 2;
- for(i=0; i<MAXHASH; i++) {
- HASH[i] = 0;
- }
- for(i=1; i<= num_pat; i++) f_prep(i, patt[i]);
-
- free_buf(mfp, buf);
- return 0;
- }
-
-
- mgrep(fd)
- int fd;
- {
- register char r_newline = '\n';
- unsigned char *text;
- register int buf_end, num_read, i=0, j, start, end, residue = 0;
-
- #if AGREP_POINTER
- if (fd != -1) {
- #endif /*AGREP_POINTER*/
- alloc_buf(fd, &text, 2*BLOCKSIZE+MAXLINE);
- text[MAXLINE-1] = '\n'; /* initial case */
- start = MAXLINE-1;
-
- while( (num_read = fill_buf(fd, text+MAXLINE, 2*BLOCKSIZE)) > 0)
- {
- if(INVERSE && COUNT) countline(text+MAXLINE, num_read);
- buf_end = end = MAXLINE + num_read -1 ;
- while(text[end] != r_newline && end > MAXLINE) end--;
- residue = buf_end - end + 1 ;
- text[start-1] = r_newline;
-
- /* MGREP_PROCESS */
- if(SHORT) { if (-1 == m_short(text, start, end)) {free_buf(fd, text); return -1;}}
- else { if (-1 == monkey1(text, start, end)) {free_buf(fd, text); return -1;}}
- if(FILENAMEONLY && (num_of_matched - prev_num_of_matched) && (NEW_FILE || !POST_FILTER)) {
- if (agrep_finalfp != NULL)
- fprintf(agrep_finalfp, "%s\n", CurrentFileName);
- else {
- int outindex;
- for(outindex=0; (outindex+agrep_outpointer<agrep_outlen) &&
- (CurrentFileName[outindex] != '\0'); outindex++) {
- agrep_outbuffer[agrep_outpointer+outindex] = CurrentFileName[outindex];
- }
- if ((CurrentFileName[outindex] != '\0') || (outindex+agrep_outpointer+1>=agrep_outlen)) {
- OUTPUT_OVERFLOW;
- free_buf(fd, text);
- return -1;
- }
- else {
- agrep_outbuffer[agrep_outpointer+outindex++] = '\n';
- }
- agrep_outpointer += outindex;
- }
- free_buf(fd, text);
- NEW_FILE = OFF;
- return 0;
- }
-
- start = MAXLINE - residue;
- if(start < 0) {
- start = 1;
- }
- strncpy(text+start, text+end, residue);
- } /* end of while(num_read = ...) */
- text[MAXLINE] = '\n';
- text[start-1] = '\n';
- if(residue > 1) {
- if(SHORT) m_short(text, start, end);
- else monkey1(text, start, end);
- }
- free_buf(fd, text);
- return 0;
- #if AGREP_POINTER
- }
- else {
- text = (unsigned char *)agrep_inbuffer;
- num_read = agrep_inlen;
- buf_end = end = num_read - 1;
- text[0] = text[end] = r_newline;
- start = 1;
- if (INVERSE && COUNT) countline(text, num_read);
-
- /* An exact copy of the above MGREP_PROCESS */
- if(SHORT) { if (-1 == m_short(text, start, end)) {free_buf(fd, text); return -1;}}
- else { if (-1 == monkey1(text, start, end)) {free_buf(fd, text); return -1;}}
- if(FILENAMEONLY && (num_of_matched - prev_num_of_matched) && (NEW_FILE || !POST_FILTER)) {
- if (agrep_finalfp != NULL)
- fprintf(agrep_finalfp, "%s\n", CurrentFileName);
- else {
- int outindex;
- for(outindex=0; (outindex+agrep_outpointer<agrep_outlen) &&
- (CurrentFileName[outindex] != '\0'); outindex++) {
- agrep_outbuffer[agrep_outpointer+outindex] = CurrentFileName[outindex];
- }
- if ((CurrentFileName[outindex] != '\0') || (outindex+agrep_outpointer+1>=agrep_outlen)) {
- OUTPUT_OVERFLOW;
- free_buf(fd, text);
- return -1;
- }
- else {
- agrep_outbuffer[agrep_outpointer+outindex++] = '\n';
- }
- agrep_outpointer += outindex;
- }
- free_buf(fd, text);
- NEW_FILE = OFF;
- return 0;
- }
-
- return 0;
- }
- #endif /*AGREP_POINTER*/
- } /* end mgrep */
-
- countline(text, len)
- unsigned char *text;
- int len;
- {
- int i;
- for (i=0; i<len; i++) if(text[i] == '\n') total_line++;
- }
-
-
- monkey1( text, start, end )
- int start, end;
- register unsigned char *text;
- {
- unsigned char *oldtext;
- register unsigned char *textend;
- register unsigned hash, i;
- register unsigned char shift;
- register int m1, j, Long=LONG;
- int pat_index, m=p_size;
- int MATCHED=0;
- register unsigned char *qx;
- register struct pat_list *p;
- unsigned char *lastout;
- int OUT=0;
-
- textend = text + end;
- m1 = m - 1;
- lastout = text+start+1;
- text = text + start + m1 ;
- while (text <= textend) {
- hash=tr1[*text];
- hash=(hash<<4)+(tr1[*(text-1)]);
- if(Long) hash=(hash<<4)+(tr1[*(text-2)]);
- shift = SHIFT1[hash];
- if(shift == 0) {
- for(i=2+Long;i<=m1;i++) {
- hash=(hash<<4)+(tr1[*(text-i)]);
- }
- hash=hash&mm;
- p = HASH[hash];
- while(p != 0) {
- pat_index = p->index;
- p = p->next;
- qx = text-m1;
- j = 0;
- while(tr[patt[pat_index][j]] == tr[*(qx++)]) j++;
- if (j > m1 ) {
- if((int)(pat_len[pat_index]) <= j) {
- if(text > textend) return 0;
- /* Udi: if -P (output the pattern # as well as the matched line) ->
- print(pat_index:) ========= bg: DONE, see below */
- num_of_matched ++;
- if(FILENAMEONLY || SILENT) return 0;
- MATCHED=1;
- oldtext = text; /* only for MULTI_OUTPUT */
-
- if(COUNT) {
- while (*text != '\n') text++;
- }
- else {
- if(FNAME && (NEW_FILE || !POST_FILTER)) {
- char nextchar = (POST_FILTER == ON)?'\n':' ';
- char *prevstring = (POST_FILTER == ON)?"\n":"";
- if (agrep_finalfp != NULL)
- fprintf(agrep_finalfp, "%s%s:%c", prevstring, CurrentFileName, nextchar);
- else {
- int outindex;
- if (prevstring[0] != '\0') {
- if(agrep_outpointer + 1 >= agrep_outlen) {
- OUTPUT_OVERFLOW;
- return -1;
- }
- else agrep_outbuffer[agrep_outpointer ++] = prevstring[0];
- }
- for(outindex=0; (outindex+agrep_outpointer<agrep_outlen) &&
- (CurrentFileName[outindex] != '\0'); outindex++) {
- agrep_outbuffer[agrep_outpointer+outindex] = CurrentFileName[outindex];
- }
- if ((CurrentFileName[outindex] != '\0') || (outindex+agrep_outpointer+2>=agrep_outlen)) {
- OUTPUT_OVERFLOW;
- return -1;
- }
- else {
- agrep_outbuffer[agrep_outpointer+outindex++] = ':';
- agrep_outbuffer[agrep_outpointer+outindex++] = nextchar;
- }
- agrep_outpointer += outindex;
- }
- NEW_FILE = OFF;
- }
-
- if (PRINTPATTERN) {
- if (agrep_finalfp != NULL)
- fprintf(agrep_finalfp, "%d- ", pat_index);
- else {
- char s[32];
- int outindex;
- sprintf(s, "%d- ", pat_index);
- for(outindex=0; (outindex+agrep_outpointer<agrep_outlen) &&
- (s[outindex] != '\0'); outindex++) {
- agrep_outbuffer[agrep_outpointer+outindex] = s[outindex];
- }
- if (s[outindex] != '\0') {
- OUTPUT_OVERFLOW;
- return -1;
- }
- agrep_outpointer += outindex;
- }
- }
-
- if(!INVERSE) {
- while(*(--text) != '\n');
-
- if (agrep_finalfp != NULL) {
- while((text < textend) && (*(++text) != '\n')) fputc(*text, agrep_finalfp);
- fputc('\n', agrep_finalfp);
- }
- else {
- while ((text < textend) && (*(++text) != '\n') && (agrep_outpointer < agrep_outlen))
- agrep_outbuffer[agrep_outpointer ++] = *text;
- if ((*text != '\n') || (agrep_outpointer + 1 >= agrep_outlen)) {
- OUTPUT_OVERFLOW;
- return -1;
- }
- else agrep_outbuffer[agrep_outpointer ++] = '\n';
- }
-
- if (MULTI_OUTPUT) { /* next match starting from end of current */
- text = oldtext + pat_len[pat_index] - 1;
- MATCHED = 0;
- }
- }
- else {
- while(*(--text) != '\n');
- if(lastout < text) OUT=1;
- if (agrep_finalfp != NULL) {
- while(lastout < text) fputc(*lastout++, agrep_finalfp);
- }
- else {
- if (text - lastout + agrep_outpointer >= agrep_outlen) {
- OUTPUT_OVERFLOW;
- return -1;
- }
- memcpy(agrep_outbuffer+agrep_outpointer, lastout, text-lastout);
- agrep_outpointer += (text-lastout);
- lastout = text;
- }
- if(OUT) {
- if (agrep_finalfp != NULL) fputc('\n', agrep_finalfp);
- else if (agrep_outpointer >= agrep_outlen) {
- OUTPUT_OVERFLOW;
- return -1;
- }
- else agrep_outbuffer[agrep_outpointer ++] = '\n';
- OUT=0;
- }
- while((text < textend) && (*(++text) != '\n'));
- lastout=text+1;
- } /* INVERSE */
- } /* COUNT */
- if ((LIMITOUTPUT > 0) && (LIMITOUTPUT <= num_of_matched)) return 0; /* done */
- } /* pat_len[pat_index] <= j */
- } /* j > m1 */
- if (MATCHED && !MULTI_OUTPUT) break; /* else look for more possible matches since we never know how many will match */
- }
- /* Udi: I think this should always be shift = 1 */
- if(!MATCHED) shift = 1;
- else {
- MATCHED = 0;
- shift = m1;
- }
- }
- text = text + shift;
- }
- if(INVERSE && !COUNT) {
- if (agrep_finalfp != NULL) {
- while(lastout <= textend) fputc(*lastout++, agrep_finalfp);
- }
- else {
- if (textend - lastout + 1 + agrep_outpointer >= agrep_outlen) {
- OUTPUT_OVERFLOW;
- return -1;
- }
- memcpy(agrep_outbuffer+agrep_outpointer, lastout, textend-lastout+1);
- agrep_outpointer += (textend-lastout+1);
- lastout = textend;
- }
- }
-
- return 0;
- }
-
- m_short( text, start, end )
- int start, end;
- register unsigned char *text;
- {
- unsigned char *oldtext;
- register unsigned char *textend;
- register unsigned i;
- register int j;
- register struct pat_list *p, *q;
- register int pat_index;
- int MATCHED=0;
- int OUT=0;
- unsigned char *lastout;
- unsigned char *qx;
-
- textend = text + end;
- lastout = text + start + 1;
- text = text + start - 1 ;
- while (++text <= textend) {
- p = HASH[*text];
- while(p != 0) {
- pat_index = p->index;
- p = p->next;
- qx = text;
- j = 0;
- while(tr[patt[pat_index][j]] == tr[*(qx++)]) j++;
- if((int)(pat_len[pat_index]) <= j) {
- if(text >= textend) return 0;
- num_of_matched++;
- if(FILENAMEONLY || SILENT) return 0;
-
- if(COUNT) {
- while (*text != '\n') text++;
- }
- else {
- MATCHED = 1;
- oldtext = text; /* used only if MULTI_OUTPUT */
- if(FNAME && (NEW_FILE || !POST_FILTER)) {
- char nextchar = (POST_FILTER == ON)?'\n':' ';
- char *prevstring = (POST_FILTER == ON)?"\n":"";
- if (agrep_finalfp != NULL)
- fprintf(agrep_finalfp, "%s%s:%c", prevstring, CurrentFileName, nextchar);
- else {
- int outindex;
- if (prevstring[0] != '\0') {
- if(agrep_outpointer + 1 >= agrep_outlen) {
- OUTPUT_OVERFLOW;
- return -1;
- }
- else agrep_outbuffer[agrep_outpointer ++] = prevstring[0];
- }
- for(outindex=0; (outindex+agrep_outpointer<agrep_outlen) &&
- (CurrentFileName[outindex] != '\0'); outindex++) {
- agrep_outbuffer[agrep_outpointer+outindex] = CurrentFileName[outindex];
- }
- if ((CurrentFileName[outindex] != '\0') || (outindex+agrep_outpointer+2>=agrep_outlen)) {
- OUTPUT_OVERFLOW;
- return -1;
- }
- else {
- agrep_outbuffer[agrep_outpointer+outindex++] = ':';
- agrep_outbuffer[agrep_outpointer+outindex++] = nextchar;
- }
- agrep_outpointer += outindex;
- }
- NEW_FILE = OFF;
- }
-
- if (PRINTPATTERN) {
- if (agrep_finalfp != NULL)
- fprintf(agrep_finalfp, "%d- ", pat_index);
- else {
- char s[32];
- int outindex;
- sprintf(s, "%d- ", pat_index);
- for(outindex=0; (outindex+agrep_outpointer<agrep_outlen) &&
- (s[outindex] != '\0'); outindex++) {
- agrep_outbuffer[agrep_outpointer+outindex] = s[outindex];
- }
- if (s[outindex] != '\0') {
- OUTPUT_OVERFLOW;
- return -1;
- }
- agrep_outpointer += outindex;
- }
- }
-
- if(!INVERSE) {
- while(*(--text) != '\n');
- if (agrep_finalfp != NULL) {
- while((text < textend) && (*(++text) != '\n')) fputc(*text, agrep_finalfp);
- fputc('\n', agrep_finalfp);
- }
- else {
- while ((text < textend) && (*(++text) != '\n') && (agrep_outpointer < agrep_outlen))
- agrep_outbuffer[agrep_outpointer ++] = *text;
- if ((*text != '\n') || (agrep_outpointer + 1 >= agrep_outlen)) {
- OUTPUT_OVERFLOW;
- return -1;
- }
- else agrep_outbuffer[agrep_outpointer ++] = '\n';
- }
- if (MULTI_OUTPUT) { /* next match starting from end of current */
- text = oldtext + pat_len[pat_index] - 1;
- MATCHED = 0;
- }
- }
- else {
- while(*(--text) != '\n');
- if(lastout < text) OUT=1;
-
- if (agrep_finalfp != NULL) {
- while(lastout < text) fputc(*lastout++, agrep_finalfp);
- }
- else {
- if (text - lastout + agrep_outpointer >= agrep_outlen) {
- OUTPUT_OVERFLOW;
- return -1;
- }
- memcpy(agrep_outbuffer+agrep_outpointer, lastout, text-lastout);
- agrep_outpointer += (text-lastout);
- lastout = text;
- }
- if(OUT) {
- if (agrep_finalfp != NULL) fputc('\n', agrep_finalfp);
- else if (agrep_outpointer >= agrep_outlen) {
- OUTPUT_OVERFLOW;
- return -1;
- }
- else agrep_outbuffer[agrep_outpointer ++] = '\n';
- OUT=0;
- }
- while((text < textend) && (*(++text) != '\n'));
- lastout=text+1;
- }
- } /* COUNT */
- if ((LIMITOUTPUT > 0) && (LIMITOUTPUT <= num_of_matched)) return 0; /* done */
- } /* pat_len[pat_index] <= j */
- if(MATCHED && !MULTI_OUTPUT) break; /* else look for more possible matches */
- }
- MATCHED = 0;
- } /* while */
- if(INVERSE && !COUNT) {
- if (agrep_finalfp != NULL) {
- while(lastout <= textend) fputc(*lastout++, agrep_finalfp);
- }
- else {
- if (textend - lastout + 1 + agrep_outpointer >= agrep_outlen) {
- OUTPUT_OVERFLOW;
- return -1;
- }
- memcpy(agrep_outbuffer+agrep_outpointer, lastout, text-lastout+1);
- agrep_outpointer += (text-lastout+1);
- lastout = textend;
- }
- }
-
- return 0;
- }
-
- f_prep(pat_index, Pattern)
- unsigned char *Pattern ;
- int pat_index;
- {
- int i, j, m;
- register unsigned hash, Mask=15;
- m = p_size;
- for (i = m-1; i>=(1+LONG); i--) {
- hash = (Pattern[i] & Mask);
- hash = (hash << 4) + (Pattern[i-1]& Mask);
- if(LONG) hash = (hash << 4) + (Pattern[i-2] & Mask);
- if((int)(SHIFT1[hash]) > m-1-i) SHIFT1[hash] = m-1-i;
- }
- if(SHORT) Mask = 255; /* 011111111 */
- hash = 0;
- for(i = m-1; i>=0; i--) {
- hash = (hash << 4) + (tr[Pattern[i]]&Mask);
- }
- /*
- if(INVERSE) hash = Pattern[1];
- */
- hash=hash&mm;
- qt = (struct pat_list *) malloc(sizeof(struct pat_list));
- qt->index = pat_index;
- pt = HASH[hash];
- qt->next = pt;
- HASH[hash] = qt;
- }
-
-
-