home *** CD-ROM | disk | FTP | other *** search
- /*
- * These functions compile a regular expression into an NFA and recognize
- * a pattern.
- *
- * Regular expressions are often used to describe a pattern that might
- * match several different or similar words. An example of a regular
- * expression subset is the wild card characters '*' and '?' used to list
- * files in a directory.
- *
- * Might as well read the books and papers written by those who first wrote
- * the various [a-z]?greps. Ken Thompson was the author of grep. In one
- * weekend, Al Aho wrote egrep and fgrep. Incidentally, when Dr. Aho was
- * the guest speaker at the Distinguished Lecture Series at Georgia Tech,
- * he personally autographed my copy of his book _Compilers: Principles,
- * Techniques, and Tools_, aka the "Dragon Book."
- *
- * See:
- *
- * Ken Thompson, "Regular Expression Search Algorithm." _Communications
- * of the ACM_, 11 (No. 6): 419-422, 1968.
- *
- * Alfred V. Aho, Ravi Sethi, and Jeffrey D. Ullman, _Compilers: Principles,
- * Techniques, and Tools_, Addison-Wesley, Reading, Mass., 1986,
- * Chapter 3, "Lexical Analysis", pp 83-158. ISBN 0-201-10088-6.
- *
- * Robert Sedgewick, _Algorithms in C_, Addison-Wesley, Reading, Mass.,
- * 1990, Chapter 20, "Pattern Matching", pp. 293-303, and Chapter 21,
- * "Parsing", pp. 305-317. ISBN 0-201-51425-7.
- *
- * Andrew Hume, "A Tale of Two Greps." _Software--Practice and Experience_,
- * 18 (No. 11): 1063-1072, 1988.
- *
- *
- * Regular Expressions in TDE
- *
- * The core of the regular expression search algorithm in TDE is based on
- * the nondeterministic finite-state machine in Dr. Segdewick's book.
- * Additional parsing, operator precedence, and nfa construction and
- * simulation info is from Dr. Aho's book. The nfa node naming convention
- * and additional nfa construction guidelines are from Ken Thompson's paper.
- * I'm much too stupid to compile a regular expression into assembly language
- * as suggested in Ken Thompson's paper, but I did get the idea to do the
- * next best thing and added C library functions as NNODE terminal types.
- *
- * The local global NFA is builded with two arrays and a deque. The
- * first array, next1, in the NFA is used to hold the path for lambda
- * transitions. The second array, next2, is used to hold alternate paths.
- * Next1 can hold all types of nodes, but it gets priority for lambda
- * transitions. The deque is a circular array where future states are queued
- * in the head and present states are pushed in the tail.
- *
- * Searching for regular expressions in TDE is not very fast. The worst
- * case search, .*x, where x is any word or letter, is probably the slowest
- * of any implementation with a regular expression search function. However,
- * TDE can handle a large regular expression subset.
- *
- * In version 3.1, I added BOW and EOW (beginning of word and end of word.)
- *
- * In version 3.2, the macro letters were moved to letters.h per Byrial Jensen.
- * We also use the bj_isxxx( ) functions to test whether a letter falls into
- * a predefined macro class.
- *
- * New editor name: TDE, the Thomson-Davis Editor.
- * Author: Frank Davis
- * Date: June 5, 1991, version 1.0
- * Date: July 29, 1991, version 1.1
- * Date: October 5, 1991, version 1.2
- * Date: January 20, 1992, version 1.3
- * Date: February 17, 1992, version 1.4
- * Date: April 1, 1992, version 1.5
- * Date: June 5, 1992, version 2.0
- * Date: October 31, 1992, version 2.1
- * Date: April 1, 1993, version 2.2
- * Date: June 5, 1993, version 3.0
- * Date: August 29, 1993, version 3.1
- * Date: November 13, 1993, version 3.2
- *
- * This code is released into the public domain, Frank Davis.
- * You may use and distribute it freely.
- */
-
- #include "tdestr.h"
- #include "common.h"
- #include "tdefunc.h"
- #include "define.h"
-
- #ifndef min
- #define min(a,b) (((a) < (b)) ? (a) : (b))
- #endif
-
- /*
- * types of nodes in the NFA.
- *
- * let's use the node naming convention in Ken Thompson's reg ex paper.
- */
- #define CNODE 0
- #define NNODE 1
-
- #define SCAN -1
-
- /*
- * types of terminals in NNODEs.
- *
- * starting with simple ASCII terminals, it's easy to build in other types of
- * terminals, e.g. wild chars, BOL, EOL, and character classes. actually,
- * we can easily build ANSI C ctype library function NNODEs.
- */
- #define STRAIGHT_ASCII 0
- #define IGNORE_ASCII 1
- #define WILD 2
- #define BOL 3
- #define EOL 4
- #define CLASS 5
- #define NOTCLASS 6
- #define WHITESPACE 7
- #define ALPHA 8
- #define UPPER 9
- #define LOWER 10
- #define ALPHANUM 11
- #define DECIMAL 12
- #define HEX 13
- #define BOW 14
- #define EOW 15
-
-
- /*
- * types of terminals in CNODEs.
- */
- #define START 0
- #define ACCEPT 1
- #define OR_NODE 2
- #define JUXTA 3
- #define CLOSURE 4
- #define ZERO_OR_ONE 5
-
-
- int lookahead;
- int regx_rc;
- int reg_len;
- int parser_state;
- char class_bits[32];
- int bit[8] = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 };
- int c1;
- int c2;
- int ttype;
- int regx_error_line;
-
- NFA_TYPE *nfa_pointer;
-
- int nfa_status;
- int search_len;
- int search_col;
- text_ptr search_string;
-
- int queued_states[REGX_SIZE+2];
- int deque[REGX_SIZE*2];
- int dq_head;
- int dq_tail;
- int stacked_node_count;
- int reset_count;
-
-
- /*
- * Name: find_regx
- * Purpose: To set up and find a regular expression.
- * Date: June 5, 1993
- * Passed: window: pointer to current window
- * Notes: Keep the regular expression until changed.
- */
- int find_regx( TDE_WIN *window )
- {
- int direction;
- int new_string;
- long found_line;
- long bin_offset;
- line_list_ptr ll;
- register TDE_WIN *win; /* put window pointer in a register */
- int rc;
- int old_rcol;
- int rcol;
- char answer[MAX_COLS+2];
- char temp[MAX_COLS+2];
-
- switch (g_status.command) {
- case FindRegX :
- new_string = TRUE;
- direction = FORWARD;
- break;
- case RepeatFindRegX :
- new_string = regx.search_defined != OK ? TRUE : FALSE;
- direction = FORWARD;
- break;
- case RepeatFindRegXBackward :
- new_string = regx.search_defined != OK ? TRUE : FALSE;
- direction = BACKWARD;
- break;
- default :
- new_string = 0;
- direction = 0;
- assert( FALSE );
- break;
- }
-
- win = window;
- entab_linebuff( );
- if (un_copy_line( win->ll, win, TRUE ) == ERROR)
- return( ERROR );
-
- regx_error_line = win->bottom_line;
-
- /*
- * get search text, using previous as default
- */
- rc = OK;
- if (new_string == TRUE) {
- *answer = '\0';
- if (regx.search_defined == OK) {
-
- assert( strlen( (char *)regx.pattern ) < (size_t)g_display.ncols );
-
- strcpy( answer, (char *)regx.pattern );
- }
-
- /*
- * string to find:
- */
- if (get_name( reg1, win->bottom_line, answer,
- g_display.message_color ) != OK || *answer == '\0')
- return( ERROR );
- regx.search_defined = OK;
-
- assert( strlen( answer ) < (size_t)g_display.ncols );
-
- strcpy( (char *)regx.pattern, answer );
- rc = build_nfa( );
- if (rc == ERROR)
- regx.search_defined = ERROR;
- }
-
- if (regx.search_defined == OK) {
- old_rcol = win->rcol;
- if (mode.inflate_tabs)
- win->rcol = entab_adjust_rcol( win->ll->line, win->ll->len, win->rcol);
- update_line( win );
- show_search_message( SEARCHING, g_display.diag_color );
- bin_offset = win->bin_offset;
- if (direction == FORWARD) {
- if ((ll = forward_regx_search( win, &found_line, &rcol )) != NULL) {
- if (g_status.wrapped && g_status.macro_executing)
- rc = ask_wrap_replace( win, &new_string );
- if (rc == OK)
- find_adjust( win, ll, found_line, rcol );
- else
- win->bin_offset = bin_offset;
- }
- } else {
- if ((ll = backward_regx_search( win, &found_line, &rcol )) != NULL) {
- if (g_status.wrapped && g_status.macro_executing)
- rc = ask_wrap_replace( win, &new_string );
- if (rc == OK)
- find_adjust( win, ll, found_line, rcol );
- else
- win->bin_offset = bin_offset;
- }
- }
- if (g_status.wrapped)
- show_search_message( WRAPPED, g_display.diag_color );
- else {
- if (nfa_status == OK)
- show_search_message( CLR_SEARCH, g_display.mode_color );
- else
- g_status.wrapped = TRUE;
- }
- if (ll == NULL) {
- /*
- * string not found
- */
- if (mode.inflate_tabs)
- win->rcol = old_rcol;
- combine_strings( temp, find5a, (char *)regx.pattern, find5b );
- error( WARNING, win->bottom_line, temp );
- rc = ERROR;
- }
- show_curl_line( win );
- make_ruler( win );
- show_ruler( win );
- } else {
- /*
- * find pattern not defined
- */
- error( WARNING, win->bottom_line, find6 );
- rc = ERROR;
- }
- return( rc );
- }
-
-
- /*
- * Name: forward_regx_search
- * Purpose: search forward for regular expression
- * Date: June 5, 1993
- * Passed: window: pointer to current window
- * rline: pointer to real line counter
- * rcol: pointer to real column variable
- * Returns: position in file if found or NULL if not found
- * Notes: Start searching from cursor position to end of file. If we hit
- * end of file without a match, start searching from the beginning
- * of file to cursor position. (do wrapped searches)
- */
- line_list_ptr forward_regx_search( TDE_WIN *window, long *rline, int *rcol )
- {
- register int len;
- int i;
- int end;
- register TDE_WIN *win; /* put window pointer in a register */
- line_list_ptr ll;
-
- /*
- * if cursor is beyond end of line then start at end of line
- */
- win = window;
- *rline = win->rline;
- i = win->rcol + 1;
- ll = win->ll;
- len = ll->len;
- if (i > len && len != EOF) {
- ll = ll->next;
- ++*rline;
- i = 0;
- }
- if (i < 0)
- i = 0;
-
- *rcol = i;
- ll = regx_search_forward( ll, rline, rcol );
-
- if (ll == NULL) {
-
- end = 0;
- if (win->ll->next != NULL) {
- end = win->ll->next->len;
- win->ll->next->len = EOF;
- }
-
- /*
- * haven't found pattern yet - now search from beginning of file
- */
- g_status.wrapped = TRUE;
-
- *rcol = 0;
- *rline = 1L;
- ll = regx_search_forward( win->file_info->line_list, rline, rcol );
-
- if (ll == win->ll && *rcol >= win->rcol)
- ll = NULL;
-
- if (win->ll->next != NULL)
- win->ll->next->len = end;
- }
- flush_keyboard( );
-
- if (ll != NULL)
- bin_offset_adjust( win, *rline );
- return( ll );
- }
-
-
- /*
- * Name: regx_search_forward
- * Purpose: search forward for pattern using nfa
- * Date: June 5, 1993
- * Passed: ll: pointer to current node in linked list
- * rline: pointer to real line counter
- * col: column in ll->line to begin search
- * Returns: position in file if found or NULL if not found
- * Notes: run the nfa machine on each character in each line
- */
- line_list_ptr regx_search_forward( line_list_ptr ll, long *rline, int *col )
- {
- if (ll->len == EOF)
- return( NULL );
- else {
- switch (g_status.command) {
- case DefineRegXGrep :
- case RepeatGrep :
- nfa_pointer = &sas_nfa;
- stacked_node_count = sas_regx.node_count;
- break;
- case FindRegX :
- case RepeatFindRegX :
- nfa_pointer = &nfa;
- stacked_node_count = regx.node_count;
- break;
- default :
- assert( FALSE );
- break;
- }
- nfa_status = OK;
- search_string = ll->line;
- search_len = ll->len;
- search_col = *col;
- reset_count = stacked_node_count * sizeof(int);
- for (; !g_status.control_break;) {
- for (; search_col <= search_len; search_col++) {
- if (nfa_match( ) != ERROR) {
- *col = search_col;
- return( ll );
- }
- }
- ++*rline;
- ll = ll->next;
- search_string = ll->line;
- if (ll->len == EOF)
- return( NULL );
- search_len = ll->len;
- search_col = 0;
- }
- return( NULL );
- }
- }
-
-
- /*
- * Name: backward_regx_search
- * Purpose: search backward for regular expression
- * Date: June 5, 1993
- * Passed: window: pointer to current window
- * rline: pointer to real line counter
- * rcol: pointer to real column variable
- * Returns: position in file if found or NULL if not found
- * Notes: Start searching from cursor position to end of file. If we hit
- * end of file without a match, start searching from the beginning
- * of file to cursor position. (do wrapped searches)
- */
- line_list_ptr backward_regx_search( TDE_WIN *window, long *rline, int *rcol )
- {
- int i;
- int len;
- int end;
- register TDE_WIN *win; /* put window pointer in a register */
- line_list_ptr ll;
-
- win = window;
- *rline = win->rline;
-
- /*
- * see if cursor is on EOF line. if so, move search start to previous node.
- */
- if (win->ll->len != EOF) {
- ll = win->ll;
- i = win->rcol - 1;
- len = ll->len;
- if (i >= len)
- i = len - 1;
- } else {
- ll = win->ll->prev;
- --*rline;
- i = 0;
- if (ll != NULL)
- i = ll->len - 1;
- }
- *rcol = i;
- ll = regx_search_backward( ll, rline, rcol );
-
- if (ll == NULL && win->rline <= win->file_info->length) {
-
- end = 0;
- if (win->ll->prev != NULL) {
- end = win->ll->prev->len;
- win->ll->prev->len = EOF;
- }
-
- /*
- * haven't found pattern yet - now search from end of file
- */
- g_status.wrapped = TRUE;
- ll = win->file_info->line_list_end;
- if (ll->prev != NULL)
- *rcol = ll->prev->len;
- else
- *rcol = 0;
- *rline = win->file_info->length;
- ll = regx_search_backward( ll->prev, rline, rcol );
-
- if (ll == win->ll && *rcol <= win->rcol)
- ll = NULL;
-
- if (win->ll->prev != NULL)
- win->ll->prev->len = end;
- }
- flush_keyboard( );
-
- if (ll != NULL)
- bin_offset_adjust( win, *rline );
- return( ll );
- }
-
-
- /*
- * Name: regx_search_backward
- * Purpose: search backward for pattern using regular expression
- * Date: June 5, 1993
- * Passed: ll: pointer to current node in linked list
- * rline: pointer to real line counter
- * col: column in ll->line to begin search
- * Returns: position in file if found or NULL if not found
- * Notes: run the nfa machine on each character in each line
- */
- line_list_ptr regx_search_backward( line_list_ptr ll, long *rline, int *col )
- {
- if (ll == NULL)
- return( NULL );
- if (ll->len == EOF)
- return( NULL );
- else {
- nfa_pointer = &nfa;
- stacked_node_count = regx.node_count;
-
- search_string = ll->line;
- search_len = ll->len;
- search_col = *col;
- reset_count = stacked_node_count * sizeof(int);
- while (!g_status.control_break) {
- for (; search_col >= 0; search_col--) {
- if (nfa_match( ) != ERROR) {
- *col = search_col;
- return( ll );
- }
- }
- --*rline;
- ll = ll->prev;
- if (ll == NULL)
- return( NULL );
- if (ll->len == EOF)
- return( NULL );
- search_string = ll->line;
- search_col = search_len = ll->len;
- }
- return( NULL );
- }
- }
-
-
- /****************************** NFA Recognizer *********************/
-
-
- /*
- * Name: nfa_match
- * Purpose: try to recognize a pattern
- * Date: June 5, 1993
- * Passed: none, but modifies local global nfa recognizer.
- * Returns: length of recognized pattern if found or ERROR if not found.
- */
- int nfa_match( void )
- {
- register int c;
- int state;
- int j;
- int n1;
- int rc;
-
- j = search_col;
- c = j < search_len ? search_string[j] : EOF;
- state = nfa_pointer->next1[0];
- dq_head = 0;
- dq_tail = 0;
- memset( queued_states, 0, reset_count );
- put_dq( SCAN );
-
- while (state) {
- if (state == SCAN) {
- memset( queued_states, 0, reset_count );
- j++;
- put_dq( SCAN );
- c = j < search_len ? search_string[j] : EOF;
- } else if (nfa_pointer->node_type[state] == NNODE) {
- n1 = nfa_pointer->next1[state];
- rc = OK;
- switch (nfa_pointer->term_type[state]) {
- case STRAIGHT_ASCII :
- if (nfa_pointer->c[state] == c)
- rc = put_dq( n1 );
- break;
- case IGNORE_ASCII :
- if (nfa_pointer->c[state] == tolower( c ))
- rc = put_dq( n1 );
- break;
- case WILD :
- if (j < search_len)
- rc = put_dq( n1 );
- break;
- case BOL :
- if (j == 0) {
- rc = put_dq( n1 );
- if (deque[dq_tail] == SCAN)
- --j;
- }
- break;
- case EOL :
- if (j == search_len) {
- rc = put_dq( n1 );
- if (deque[dq_tail] == SCAN)
- --j;
- }
- break;
- case CLASS :
- if (c != EOF && nfa_pointer->class[state][c/8] & bit[c%8])
- rc = put_dq( n1 );
- break;
- case NOTCLASS :
- if (c != EOF && !(nfa_pointer->class[state][c/8] & bit[c%8]))
- rc = put_dq( n1 );
- break;
- case WHITESPACE :
- if (c == ' ' || c == '\t')
- rc = put_dq( n1 );
- break;
- case ALPHA :
- if (c != EOF && bj_isalpha( c ))
- rc = put_dq( n1 );
- break;
- case UPPER :
- if (c != EOF && bj_isupper( c ))
- rc = put_dq( n1 );
- break;
- case LOWER :
- if (c != EOF && bj_islower( c ))
- rc = put_dq( n1 );
- break;
- case ALPHANUM :
- if (c != EOF && bj_isalnum( c ))
- rc = put_dq( n1 );
- break;
- case DECIMAL :
- if (c != EOF && bj_isdigit( c ))
- rc = put_dq( n1 );
- break;
- case HEX :
- if (c != EOF && bj_isxdigit( c ))
- rc = put_dq( n1 );
- break;
- case BOW :
- if (c != EOF) {
- if (!myiswhitespc( c )) {
- if (j == 0) {
- rc = put_dq( n1 );
- if (deque[dq_tail] == SCAN)
- --j;
- } else if (j > 0) {
- if (myiswhitespc( search_string[j-1] )) {
- rc = put_dq( n1 );
- if (deque[dq_tail] == SCAN)
- --j;
- }
- }
- }
- }
- break;
- case EOW :
- if (c == EOF) {
- if (search_len > 0) {
- if (!myiswhitespc( search_string[search_len-1] )) {
- rc = put_dq( n1 );
- if (deque[dq_tail] == SCAN)
- --j;
- }
- }
- } else {
- if (myiswhitespc( c ) && j > 0) {
- if (!myiswhitespc( search_string[j-1] )) {
- rc = put_dq( n1 );
- if (deque[dq_tail] == SCAN)
- --j;
- }
- }
- }
- break;
- default :
- assert( FALSE );
- break;
- }
- if (rc == ERROR)
- return( 0 );
- } else {
- assert( nfa_pointer->node_type[state] == CNODE );
-
- n1 = nfa_pointer->next1[state];
- if (push_dq( n1 ) == ERROR)
- return( 0 );
-
- if (n1 != nfa_pointer->next2[state])
- if (push_dq( nfa_pointer->next2[state] ) == ERROR)
- return( 0 );
- }
- if (dequeempty( ) || j > search_len)
- return( ERROR );
- state = pop_dq( );
- }
- return( j - search_col );
- }
-
-
- /*
- * Name: put_dq
- * Purpose: queue future states
- * Date: June 5, 1993
- * Passed: v: state to queue
- * Returns: none, but modifies local global deque variables. future
- * states are queued in the head.
- * Notes: do not add any duplicate states to the head of the deque.
- */
- int put_dq( int v )
- {
- if (queued_states[v+1] == 0) {
- ++queued_states[v+1];
- deque[dq_head] = v;
- dq_head--;
- if (dq_head < 0)
- dq_head = REGX_SIZE - 1;
- if (dq_tail == dq_head) {
- nfa_status = ERROR;
- show_search_message( NFA_GAVE_UP, g_display.diag_color );
- return( ERROR );
- }
- }
- return( OK );
- }
-
-
- /*
- * Name: push_dq
- * Purpose: push state on deque
- * Date: June 5, 1993
- * Passed: v: state to push
- * Returns: whether stack is OK or if stack overflowed - ERROR. present
- * states are stacked.
- */
- int push_dq( int v )
- {
- ++dq_tail;
- if (dq_tail == dq_head) {
- nfa_status = ERROR;
- show_search_message( NFA_GAVE_UP, g_display.diag_color );
- return( ERROR );
- } else {
- if (dq_tail > REGX_SIZE - 1)
- dq_tail = 0;
- deque[dq_tail] = v;
- return( OK );
- }
- }
-
-
- /*
- * Name: pop_dq
- * Purpose: pop next state on dq
- * Date: June 5, 1993
- * Passed: none, but looks at local global nfa recognizer
- * Returns: state on top of deque and decrements stack pointer
- */
- int pop_dq( void )
- {
- register int v;
-
- v = deque[dq_tail];
- dq_tail--;
- if (dq_tail < 0)
- dq_tail = REGX_SIZE - 1;
- return( v );
- }
-
-
- /*
- * Name: dequeempty
- * Purpose: any room on dq?
- * Date: June 5, 1993
- * Passed: none, but looks at local global nfa recognizer
- * Returns: boolean, TRUE if empty.
- * Notes: the deque is a circular array where future states are
- * queued in the head and present states are pushed in the tail.
- */
- int dequeempty( void )
- {
- if (dq_tail > dq_head)
- return( dq_tail - dq_head == 1 );
- else
- return( dq_tail + REGX_SIZE - dq_head == 1 );
- }
-
-
- /*************************** NFA Compiler **************************/
-
-
- /*
- * Name: build_nfa
- * Purpose: initialize nfa and call the compiler
- * Date: June 5, 1993
- * Passed: none, but looks at local global pattern.
- * Returns: whether or not an ERROR occured
- * Notes: sets up the initial variables for the compiler. builds
- * initial and acceptor states for the nfa after compiler finishes.
- */
- int build_nfa( void )
- {
- regx_rc = OK;
-
- init_nfa( );
- lookahead = 0;
- reg_len = strlen( (char *)regx.pattern );
- parser_state = 1;
-
- nfa.next1[0] = expression( );
- if (regx_rc == OK) {
- emit_cnode( 0, START, nfa.next1[0], nfa.next1[0] );
- emit_cnode( parser_state, ACCEPT, 0, 0 );
- regx.node_count = parser_state + 2;
- }
-
- if (g_status.command == DefineRegXGrep) {
- memcpy( &sas_nfa, &nfa, sizeof(NFA_TYPE) );
- memcpy( &sas_regx, ®x, sizeof(REGX_INFO) );
- }
- return( regx_rc );
- }
-
-
- /*
- * Name: expression
- * Purpose: break reg ex into terms and expressions
- * Date: June 5, 1993
- * Passed: none, but looks at local global pattern.
- * Returns: none
- * Notes: because recursion can go fairly deep, almost all variables
- * were made local global. no need to save most of this stuff
- * on the stack.
- */
- int expression( void )
- {
- int t1;
- int t2;
- int r;
-
- t1 = term( );
- r = t1;
- if (regx.pattern[lookahead] == '|') {
- lookahead++;
- parser_state++;
- r = t2 = parser_state;
- parser_state++;
- emit_cnode( t2, OR_NODE, expression( ), t1 );
- emit_cnode( t2-1, JUXTA, parser_state, parser_state );
-
- /*
- * the OR_NODE seems to need a path from the previous node
- */
- if (nfa.node_type[t1] == CNODE)
- t1 = min( nfa.next1[t1], nfa.next2[t1] );
- nfa.next1[t1-1] = t2;
- if (nfa.node_type[t1-1] == NNODE)
- nfa.next2[t1-1] = t2;
- }
- return( r );
- }
-
-
- /*
- * Name: term
- * Purpose: break reg ex into factors and terms
- * Date: June 5, 1993
- * Passed: none, but looks at local global pattern.
- * Returns: none
- */
- int term( void )
- {
- int r;
-
- r = factor( );
- if (regx.pattern[lookahead] == '(' || letter( regx.pattern[lookahead] ))
- term( );
- else if (Kleene_star( regx.pattern[lookahead] ))
- regx_error( reg11 );
- return( r );
- }
-
-
- /*
- * Name: factor
- * Purpose: add NNODEs and CNODEs to local global nfa
- * Date: June 5, 1993
- * Passed: none
- * Returns: none, but modifies local global nfa.
- * Notes: this function does most of the compiling. it recognizes all
- * NNODEs, predefined macros, escape characters, and operators.
- */
- int factor( void )
- {
- int t1;
- int t2;
- int r;
- int c;
- int paren;
-
- t2 = t1 = parser_state;
- paren = FALSE;
- c = regx.pattern[lookahead];
- if (c == '(') {
- lookahead++;
- t2 = expression( );
- if (regx.pattern[lookahead] == ')') {
- lookahead++;
- paren = TRUE;
- } else
-
- /*
- * unmatched open parens
- */
- regx_error( reg2 );
- } else if (letter( c )) {
- paren = FALSE;
- switch (c) {
- case ']' :
-
- /*
- * unmatched close bracket
- */
- regx_error( reg9 );
- break;
- case '.' :
- ttype = WILD;
- break;
- case ',' :
- ttype = WHITESPACE;
- break;
- case '^' :
- ttype = BOL;
- break;
- case '$' :
- ttype = EOL;
- break;
- case '<' :
- ttype = BOW;
- break;
- case '>' :
- ttype = EOW;
- break;
- case '\\' :
- ++lookahead;
- ttype = mode.search_case == IGNORE ? IGNORE_ASCII : STRAIGHT_ASCII;
- if (lookahead != '\0') {
- if (regx.pattern[lookahead] != ':')
- c = escape_char( regx.pattern[lookahead] );
-
- /*
- * predefined unix-like macros.
- */
- else {
- c = regx.pattern[lookahead+1];
- if (c != '\0') {
- switch (c) {
- case REG_ALPHANUM :
- ++lookahead;
- ttype = ALPHANUM;
- break;
- case REG_WHITESPACE :
- ++lookahead;
- ttype = WHITESPACE;
- break;
- case REG_ALPHA :
- ++lookahead;
- ttype = ALPHA;
- break;
- case REG_DECIMAL :
- ++lookahead;
- ttype = DECIMAL;
- break;
- case REG_HEX :
- ++lookahead;
- ttype = HEX;
- break;
- case REG_LOWER :
- ++lookahead;
- ttype = LOWER;
- break;
- case REG_UPPER :
- ++lookahead;
- ttype = UPPER;
- break;
- default :
- c = escape_char( regx.pattern[lookahead] );
- break;
- }
- } else
- c = escape_char( regx.pattern[lookahead] );
- }
- } else
- regx_error( reg4 );
- break;
- case '[' :
- memset( class_bits, 0, sizeof(char) * 32 );
- ++lookahead;
- if (lookahead != '\0') {
- c = regx.pattern[lookahead];
- if (c == '^') {
- ++lookahead;
- ttype = NOTCLASS;
- } else
- ttype = CLASS;
-
- c1 = regx.pattern[lookahead];
- do {
- class_bits[c1/8] |= bit[c1%8];
- if (c1 != '\0')
- ++lookahead;
- if (regx.pattern[lookahead] == '-') {
- ++lookahead;
- c2 = regx.pattern[lookahead];
- if (c2 != '\0') {
-
- /*
- * just in case the hi for the range is given first,
- * switch c1 and c2, e.g. [9-0].
- */
- if (c2 < c1) {
- c = c2;
- c2 = c1;
- c1 = c;
- }
-
- for (c=c1; c <= c2; c++)
- class_bits[c/8] |= bit[c%8];
-
- if (regx.pattern[lookahead] != '\0')
- ++lookahead;
- } else
- regx_error( reg10 );
- }
- c1 = regx.pattern[lookahead];
- } while (c1 != '\0' && c1 != ']');
-
- if (c1 == '\0')
- regx_error( reg5 );
- } else
- regx_error( reg6 );
- break;
- default :
- if (mode.search_case == IGNORE) {
- c = tolower( c );
- ttype = IGNORE_ASCII;
- } else
- ttype = STRAIGHT_ASCII;
- }
- emit_nnode( parser_state, ttype, c, parser_state+1, parser_state+1 );
- if (ttype == CLASS || ttype == NOTCLASS) {
- nfa.class[parser_state] = calloc( 32, sizeof(char) );
- if (nfa.class[parser_state] != NULL)
- memcpy( nfa.class[parser_state], class_bits, sizeof( char )*32 );
- else
- regx_error( reg7 );
- }
- t2 = parser_state;
- lookahead++;
- parser_state++;
- } else if (c == '\0')
- return( 0 );
- else {
- if (c == '*' || c == '+' || c == '?')
- regx_error( reg8 );
- else if (c == ')')
- regx_error( reg3 );
- else
- regx_error( reg2 );
- }
-
- c = regx.pattern[lookahead];
- switch (c) {
- case '*' :
- emit_cnode( parser_state, CLOSURE, parser_state+1, t2 );
- r = parser_state;
- if (nfa.node_type[t1] == CNODE)
- t1 = min( nfa.next1[t1], nfa.next2[t1] );
- nfa.next1[t1-1] = parser_state;
- if (nfa.node_type[t1-1] == NNODE)
- nfa.next2[t1-1] = parser_state;
- lookahead++;
- parser_state++;
- paren = FALSE;
- break;
- case '+' :
- if (paren == TRUE) {
- emit_cnode( parser_state, JUXTA, parser_state+2, parser_state+2 );
- parser_state++;
- }
-
- emit_cnode( parser_state, JUXTA, t2, t2 );
- r = parser_state;
- parser_state++;
-
- if (paren == FALSE) {
- nfa.next1[t2] = parser_state;
- if (nfa.node_type[t2] == NNODE)
- nfa.next2[t2] = parser_state;
- }
-
- emit_cnode( parser_state, CLOSURE, parser_state+1, t2 );
- if (nfa.node_type[t1] == CNODE)
- t1 = min( nfa.next1[t1], nfa.next2[t1] );
- nfa.next1[t1-1] = r;
- if (nfa.node_type[t1-1] == NNODE)
- nfa.next2[t1-1] = r;
- parser_state++;
- lookahead++;
- paren = FALSE;
- break;
- case '?' :
- emit_cnode( parser_state, JUXTA, parser_state+2, parser_state+2 );
- parser_state++;
- r = parser_state;
- emit_cnode( parser_state, ZERO_OR_ONE, parser_state+1, t2 );
- if (nfa.node_type[t1] == CNODE)
- t1 = min( nfa.next1[t1], nfa.next2[t1] );
- nfa.next1[t1-1] = parser_state;
- if (nfa.node_type[t1-1] == NNODE)
- nfa.next2[t1-1] = parser_state;
- parser_state++;
- lookahead++;
- paren = FALSE;
- break;
- default :
- r = t2;
- break;
- }
-
- /*
- * close parens seem to need a JUXTA node to gather all reg ex's
- * to a common point.
- */
- if (paren) {
- emit_cnode( parser_state, JUXTA, parser_state+1, parser_state+1 );
- parser_state++;
- }
- return( r );
- }
-
-
- /*
- * Name: escape_char
- * Purpose: recognize escape and C escape sequences
- * Date: June 5, 1993
- * Passed: let: letter to escape
- * Returns: escaped letter
- */
- int escape_char( int let )
- {
- switch (let) {
- case '0' :
- let = 0x00;
- break;
- case 'a' :
- let = 0x07;
- break;
- case 'b' :
- let = 0x08;
- break;
- case 'n' :
- let = 0x0a;
- break;
- case 'r' :
- let = 0x0d;
- break;
- case 't' :
- let = 0x09;
- break;
- default :
- break;
- }
- return( let );
- }
-
-
- /*
- * Name: emit_cnode
- * Purpose: add a null node to our pattern matching machine
- * Date: June 5, 1993
- * Passed: index: current node in nfa
- * ttype: terminal type - CLOSURE, OR, JUXTA, etc...
- * n1: pointer to next state, path for lambda transitions
- * n2: pointer to other next state, usually a NNODE
- * Returns: none, but modifies local global nfa.
- */
- void emit_cnode( int index, int ttype, int n1, int n2 )
- {
- assert( index >= 0);
- assert( index < REGX_SIZE );
-
- nfa.node_type[index] = CNODE;
- nfa.term_type[index] = ttype;
- nfa.c[index] = 0;
- nfa.next1[index] = n1;
- nfa.next2[index] = n2;
- }
-
-
- /*
- * Name: emit_nnode
- * Purpose: add a to our pattern matching machine
- * Date: June 5, 1993
- * Passed: index: current node in nfa
- * ttype: terminal type - EOL, ASCII, etc...
- * c: letter this node recognizes
- * n1: pointer to next state
- * n2: pointer to other next state, which can be same as n1
- * Returns: none, but modifies local global nfa.
- */
- void emit_nnode( int index, int ttype, int c, int n1, int n2 )
- {
- assert( index >= 0);
- assert( index < REGX_SIZE );
-
- nfa.node_type[index] = NNODE;
- nfa.term_type[index] = ttype;
- nfa.c[index] = c;
- nfa.next1[index] = n1;
- nfa.next2[index] = n2;
- }
-
-
- /*
- * Name: init_nfa
- * Purpose: set local global nfa to NULL state
- * Date: June 5, 1993
- * Passed: none
- */
- void init_nfa( void )
- {
- int i;
-
- for (i=0; i < REGX_SIZE; i++) {
- nfa.node_type[i] = NNODE;
- nfa.term_type[i] = 0;
- nfa.c[i] = 0;
- nfa.next1[i] = 0;
- nfa.next2[i] = 0;
- if (nfa.class[i] != NULL)
- free( nfa.class[i] );
- nfa.class[i] = NULL;
- }
- }
-
-
- /*
- * Name: regx_error
- * Purpose: display reg ex error message and set reg ex error code
- * Date: June 5, 1993
- * Passed: line: line to display error
- * Returns: none, but sets reg ex return code to error.
- */
- void regx_error( char *line )
- {
- error( WARNING, regx_error_line, line );
- regx_rc = ERROR;
- }
-
-
- /*
- * Name: separator
- * Purpose: determine if character is a reg ex separator
- * Date: June 5, 1993
- * Passed: let: letter to look at
- * Returns: whether or not 'let' is a separator
- */
- int separator( int let )
- {
- return( let == 0 || let == ')' || let == '|' );
- }
-
-
- /*
- * Name: Kleene_star
- * Purpose: determine if character is a reg ex operator
- * Date: June 5, 1993
- * Passed: let: letter to look at
- * Returns: whether or not 'let' is a letter
- */
- int Kleene_star( int let )
- {
- return( let == '*' || let == '+' || let == '?' );
- }
-
-
- /*
- * Name: letter
- * Purpose: determine if character is a recognized reg ex character
- * Date: June 5, 1993
- * Passed: let: letter to look at
- * Returns: whether or not 'let' is a letter.
- */
- int letter( int let )
- {
- return( !separator( let ) && !Kleene_star( let ) );
- }
-